Code Example #1
    def consume(self, targets):
        print(targets)
        nm = NmapProcess(targets, options='-v -sn')
        rc = nm.run()

        try:
            parsed = NmapParser.parse(nm.stdout)
        except NmapParserException as e:
            print("Exception raised while parsing scan: %s" % (e.msg))
            return

        HOST_UP = 1
        HOST_DOWN = 0

        scans = Table('host_up', connection=self.dynamo)

        with scans.batch_write() as batch:
            for host in parsed.hosts:
                # Insert into database and delete from queue
                if host.status == 'down':
                    status = HOST_DOWN
                elif host.status == 'up':
                    status = HOST_UP
                else:
                    status = -1

                batch.put_item(
                    data={
                        'ip': host.address,
                        'status': status,
                        'datetime': int(time.time())
                    })
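A minimal sketch of the batch_write() pattern that all of these examples rely on, assuming a DynamoDB table named 'example_table' with hash key 'id' already exists and that boto can find AWS credentials; the names here are illustrative only.

import time

import boto.dynamodb2
from boto.dynamodb2.table import Table

conn = boto.dynamodb2.connect_to_region('us-east-1')
table = Table('example_table', connection=conn)

# batch_write() buffers puts/deletes and sends them as BatchWriteItem requests
# (boto flushes every 25 buffered items), so no explicit flush is needed.
with table.batch_write() as batch:
    for i in range(100):
        batch.put_item(data={
            'id': str(i),
            'created': int(time.time()),
        })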
Code Example #2
def main():
    sqs_cnx = boto.sqs.connect_to_region('us-east-1')
    queue = sqs_cnx.get_queue('reviewboard-slack-logs')
    queue.set_message_class(RawMessage)

    table = Table('reviewboard-slack-logs')

    while 1:
        messages = queue.get_messages(num_messages=10, wait_time_seconds=10)
        to_delete = []

        if messages:
            with table.batch_write() as batch:
                for message in messages:
                    body = message.get_body()

                    try:
                        data = json.loads(body)
                        attrs = {
                            key: value
                            for key, value in data.iteritems()
                            if value != ''
                        }

                        print data['timestamp']
                        batch.put_item(attrs)
                        to_delete.append(message)
                    except ValueError:
                        print '??? %r' % body

            if to_delete:
                sqs_cnx.delete_message_batch(queue, to_delete)
        else:
            break
Code Example #3
def dynamo_main():
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    prop_appl_table = Table(dynamodb_prop_appl_table, connection=conn)
    table = Table(dynamodb_diag_rep_table, connection=conn)

    prop_appl_res = prop_appl_table.scan()

    prop_appl_ids = []

    for prop_appl in prop_appl_res:
        prop_appl_ids.append(dict(prop_appl.get_raw_keys())['propApplId']['S'])

    with table.batch_write() as batch:
        for i in range(0, 20):
            diag_rep = {}
            global next_id
            diag_rep['diagRepId'] = str(next_id)
            next_id += 1
            diag_rep['propApplId'] = prop_appl_ids[randint(
                0,
                len(prop_appl_ids) - 1)]
            diag_rep['managerId'] = manager_ids[randint(
                0,
                len(manager_ids) - 1)]
            diag_rep['organisations'] = ["1"]
            diag_rep['timestamp'] = int(time.time())
            diag_rep['description'] = "Description"
            batch.put_item(data=diag_rep)
Code Example #4
File: UpScanner.py Project: ss23/ssy-scanner
    def consume(self, targets):
        print(targets)
        nm = NmapProcess(targets, options='-v -sn')
        rc = nm.run()

        try:
            parsed = NmapParser.parse(nm.stdout)
        except NmapParserException as e:
            print("Exception raised while parsing scan: %s" % (e.msg))
            return

        HOST_UP = 1
        HOST_DOWN = 0

        scans = Table('host_up', connection=self.dynamo)

        with scans.batch_write() as batch:
            for host in parsed.hosts:
                # Insert into database and delete from queue
                if host.status == 'down':
                    status = HOST_DOWN
                elif host.status == 'up':
                    status = HOST_UP
                else:
                    status = -1

                batch.put_item(data={
                    'ip': host.address,
                    'status': status,
                    'datetime': int(time.time())
                })
Code Example #5
def data():
    from boto import dynamodb2
    from boto.dynamodb2.table import Table

    TABLE_NAME = "issdata"
    REGION = "us-west-1"

    conn = dynamodb2.connect_to_region(
        REGION,
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    )
    table = Table(TABLE_NAME, connection=conn)

    absolute_junk = {
        "favorite_color": "blue",
        "quest": "seek_holy_grail",
    }

    with table.batch_write() as table_batch:
        for example_counter in xrange(10):

            required_hash_data = {
                "user_id": 11,
                "timestamp":
                datetime_to_timestamp_ms(datetime.datetime.utcnow())
            }

            final_dynamo_data = dict(absolute_junk.items() +
                                     required_hash_data.items())
            table_batch.put_item(data=final_dynamo_data)
Code Example #6
def do_insert():
    conn = boto.dynamodb2.connect_to_region(region)
    table = Table(table_name, connection=conn)
    cx = sqlite3.connect(name_db)
    cu = cx.cursor()
    count = 0
    cu.execute("select * from name")
    test_count = 1000
    while True:
        rets = cu.fetchmany(batch_count)
        if len(rets) <= 0:
            break
        with table.batch_write() as batch:
            for ret in rets:
                name = ret[0]
                dates = []
                while len(dates) < count_per_user:
                    date = generate_date()
                    if date in dates:
                        continue
                    dates.append(date)
                for date in dates:
                    score = generate_score()
                    batch.put_item(data={
                        u'name': name,
                        u'date': date,
                        u'score': score
                    })
                    count += 1
        with open(u'/tmp/insert_count', u'w') as f:
            f.write(u'%s\n' % unicode(count))
    with open(u'/tmp/insert_count', u'a') as f:
        f.write(u'done\n')
Code Example #7
def dynamo_main():
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    diag_rep_table = Table(dynamodb_diag_rep_table, connection=conn)
    table = Table(dynamodb_main_org_table, connection=conn)

    diag_rep_res = diag_rep_table.scan()

    diag_rep_ids = []

    for diag_rep in diag_rep_res:
        diag_rep_ids.append(dict(diag_rep.get_raw_keys())['diagRepId']['S'])

    with table.batch_write() as batch:
        for i in range(0, 10):
            main_org = {}
            global next_id
            main_org['orgId'] = str(next_id)
            next_id += 1
            engineers = []
            engineers.append(engineer_ids[0])
            engineers.append(engineer_ids[1])
            main_org['engineers'] = engineers
            pending = []
            pending.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            pending.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            resp = []
            resp.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            resp.append(diag_rep_ids[randint(0, len(diag_rep_ids) - 1)])
            main_org['pendingDiagnosticReports'] = pending
            main_org['respondedDiagnosticReports'] = resp
            batch.put_item(data=main_org)
Code Example #8
class DDBToBeSlurped(Dynamo):
    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with test_slurps and test_failed_slurps tables
        """
        super(DDBToBeSlurped, self).__init__(access_key, secret)

        self.table = Table('to_be_slurped', connection=self.connection)

    def save_info(self, search_terms):
        """
        search_terms can either be in the form of a list of dicts or else a single dict.
        If slurp_info is a list, batch write will be used
        """
        if isinstance(search_terms, basestring):
            search_terms = [search_terms]
        # search_terms = {'searchterm': search_terms}
        search_terms = [{'searchterm': x} for x in search_terms]
        # print search_terms
        with self.table.batch_write() as batch:
            for s in search_terms:
                batch.put_item(data=s, overwrite=True)

    def get_table(self, table_name=None):
        """
        Convenience method for client who may wish to get a specific table from the DynamoDB connection
        """
        table_name = table_name or self.table.table_name
        return Table(table_name, connection=self.connection)

    def truncate_table(self):
        """
        Delete whole table
        """
        with self.table.batch_write() as batch:
            for item in self.table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.table)

    def get_slurps_table_info(self):
        return self.get_table_info(self.table)

    def get_table_as_df(self):
        return DataFrame([dict(r) for r in self.table.scan()])
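A short usage sketch for the class above, assuming the 'to_be_slurped' table exists and credentials are available to the Dynamo base class; the search terms are made up.

# Hypothetical usage of DDBToBeSlurped; requires the real table and credentials.
store = DDBToBeSlurped()
store.save_info(['python', 'dynamodb'])   # batch-writes one item per search term
df = store.get_table_as_df()              # scans the table into a DataFrame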
Code Example #9
 def truncate_slurp_table(self):
     """
     WARNING! Only use for test mode table
     """
     assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
     test_slurps_table = Table('test_slurps', connection=self.connection)
     with test_slurps_table.batch_write() as batch:
         for item in self.slurps_table.scan():
             batch.delete_item(searchterm=item['searchterm'])
Code Example #10
File: ddb_slurps.py Project: yz-/ut
 def truncate_slurp_table(self):
     """
     WARNING! Only use for test mode table
     """
     assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
     test_slurps_table = Table('test_slurps', connection=self.connection)
     with test_slurps_table.batch_write() as batch:
         for item in self.slurps_table.scan():
             batch.delete_item(searchterm=item['searchterm'])
Code Example #11
def dynamo_main(appliances):
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    table = Table(dynamodb_appliance_table, connection=conn)

    with table.batch_write() as batch:
        for appliance in appliances:
            batch.put_item(data=appliance)
Code Example #12
File: engine.py Project: hoov/flywheel
    def save(self, items, overwrite=None):
        """
        Save models to dynamo

        Parameters
        ----------
        items : list or :class:`~flywheel.models.Model`
        overwrite : bool, optional
            If False, raise exception if item already exists (default set by
            :attr:`.default_conflict`)

        Raises
        ------
        exc : :class:`boto.dynamodb2.exceptions.ConditionalCheckFailedException`
            If overwrite is False and an item already exists in the database

        Notes
        -----
        Overwrite will replace the *entire* item with the new one, not just
        different fields. After calling save(overwrite=True) you are guaranteed
        that the item in the database is exactly the item you saved.

        Due to the structure of the AWS API, saving with overwrite=True is much
        faster because the requests can be batched.

        """
        if overwrite is None:
            overwrite = self.default_conflict in ('update', 'overwrite')
        if isinstance(items, Model):
            items = [items]
        if not items:
            return
        tables = defaultdict(list)
        for item in items:
            tables[item.meta_.ddb_tablename].append(item)
        for tablename, items in tables.iteritems():
            table = Table(tablename, connection=self.dynamo)
            if overwrite:
                with table.batch_write() as batch:
                    for item in items:
                        item.pre_save_(self)
                        batch.put_item(data=item.ddb_dump_())
                        item.post_save_()
            else:
                for item in items:
                    expected = {}
                    for name in item.meta_.fields:
                        expected[name] = {
                            'Exists': False,
                        }
                    item.pre_save_(self)
                    boto_item = Item(table, data=item.ddb_dump_())
                    self.dynamo.put_item(tablename, boto_item.prepare_full(),
                                         expected=expected)
                    item.post_save_()
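The overwrite path above can be batched because an unconditional put carries no condition, while the non-overwrite path needs a per-item 'Exists': False check that DynamoDB only accepts outside of batch requests. A hedged sketch of the same distinction using boto directly, with an illustrative 'users' table:

from boto.dynamodb2.exceptions import ConditionalCheckFailedException
from boto.dynamodb2.table import Table

users = Table('users')

# Unconditional writes can be grouped into BatchWriteItem requests.
with users.batch_write() as batch:
    batch.put_item(data={'username': 'alice', 'plan': 'free'})

# A conditional "do not clobber" write must be a single PutItem per item.
try:
    users.put_item(data={'username': 'alice', 'plan': 'free'}, overwrite=False)
except ConditionalCheckFailedException:
    print('item already exists')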
Code Example #13
File: engine.py Project: hoov/flywheel
    def delete(self, items, raise_on_conflict=None):
        """
        Delete items from dynamo

        Parameters
        ----------
        items : list or :class:`~flywheel.model.Model`
            List of :class:`~flywheel.models.Model` objects to delete
        raise_on_conflict : bool, optional
            If True, raise exception if the object was changed concurrently in
            the database (default set by :attr:`.default_conflict`)

        Raises
        ------
        exc : :class:`boto.dynamodb2.exceptions.ConditionalCheckFailedException`
            If raise_on_conflict is True and the item was changed concurrently
            in the database

        Notes
        -----
        Due to the structure of the AWS API, deleting with
        raise_on_conflict=False is much faster because the requests can be
        batched.

        """
        if raise_on_conflict is None:
            raise_on_conflict = self.default_conflict == 'raise'
        if isinstance(items, Model):
            items = [items]
        if not items:
            return
        tables = defaultdict(list)
        for item in items:
            tables[item.meta_.ddb_tablename].append(item)

        count = 0
        for tablename, items in tables.iteritems():
            if raise_on_conflict:
                for item in items:
                    expected = item.construct_ddb_expects_()
                    count += 1
                    self.dynamo.delete_item(tablename, item.pk_dict_,
                                            expected=expected)
            else:
                table = Table(tablename, connection=self.dynamo)
                with table.batch_write() as batch:
                    for item in items:
                        if isinstance(item, Model):
                            keys = item.pk_dict_
                        else:
                            keys = dict(item)
                        count += 1
                        batch.delete_item(**keys)
        return count
Code Example #14
def dynamo_main(appliance_statuses, stat_icon):
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    table = Table(dynamodb_appliance_status_table, connection=conn)

    with table.batch_write() as batch:
        for appliance_stat in appliance_statuses:
            global next_id
            appliance_stat['statusId'] = str(next_id)
            appliance_stat['icon'] = stat_icon
            next_id += 1
            batch.put_item(data=appliance_stat)
Code Example #15
def add_sample_schedules():
	try:
		tb_schedules = Table('bus_n_stops', connection=cm.db)
		with tb_schedules.batch_write() as batch:
			batch.put_item(data={
				'id': '7',
				's_id': '2',
				'stop_id': 'alpy',
				'time': get_xhd_from_time(hour=12, minute=0),
				'cnt': 5,
			})
			batch.put_item(data={
				'id': '8',
				's_id': '2',
				'stop_id': 'kykm',
				'time': get_xhd_from_time(hour=12, minute=15),
				'cnt': 5,
			})
			batch.put_item(data={
				'id': '9',
				's_id': '2',
				'stop_id': 'ochr',
				'time': get_xhd_from_time(hour=12, minute=30),
				'cnt': 5,
			})
			batch.put_item(data={
				'id': '10',
				's_id': '2',
				'stop_id': 'vlkv',
				'time': get_xhd_from_time(hour=12, minute=45),
				'cnt': 5,
			})
			batch.put_item(data={
				'id': '11',
				's_id': '2',
				'stop_id': 'kpy',
				'time': get_xhd_from_time(hour=13, minute=0),
				'cnt': 5,
			})
			batch.put_item(data={
				'id': '12',
				's_id': '2',
				'stop_id': 'klm',
				'time': get_xhd_from_time(hour=13, minute=15),
				'cnt': 5,
			})
		return message_helper.success()
	except Exception, e:
		return message_helper.error(str(e))
Code Example #16
File: ddb_kids.py Project: SRHerzog/ut
class DDBKids(Dynamo):

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.table = Table('test_kids', connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        super(DDBKids, self).__init__(access_key, secret)
        self.table = Table('kids', connection=self.connection)

    def set_max_kid(self, account_name, kid):
        """
        Set the max kid used for an account
        :param account_name:
        :param kid: int >= 0.
        """
        return self.table.put_item({'account_name': account_name, 'kid': kid}, overwrite=True)

    def get_max_kid(self, account_name):
        """
        Get the max kid already used for an account. If the account does not exist, create it in the db with a KID of 0.
        :param account_name:
        """
        res = self.table.get_item(account_name=account_name)
        if res['kid']:
            return int(res['kid'])
        else:
            self.logger.warn("Creating a new (max) KID entry for account {} because it did not yet exist in ddb_kids".format(account_name))
            self.set_max_kid(account_name, 0)
            return 0

    def modify_throughput(self, requested_read, requested_write, table=None):
        table = table or self.table
        return super(DDBKids, self).modify_throughput(requested_read, requested_write, table)

    def truncate_table(self):
        """
        WARNING! Only use for test mode table
        """
        assert self.table.table_name == 'test_kids', "Will only truncate test table. To truncate production table, run code manually"
        with self.table.batch_write() as batch:
            for item in self.table.scan():
                batch.delete_item(dt=item['dt'])
Code Example #17
File: __init__.py Project: Fablr/Corgi-Cache
class CorgiCache:
    def __init__(self):
        self.dynamo = boto.dynamodb2.connect_to_region(DYNAMO_REGION)
        self.feeds = Table("Feeds", connection=self.dynamo)
        self.tokens = Table("Tokens", connection=self.dynamo)
        return

    def feed_id_exists(self, feed_id):
        items = list(self.feeds.query_2(ID__eq=feed_id))
        return len(items) > 0

    def put_feed(self, data):
        if "ID" not in data or "URL" not in data:
            logging.debug("invalid item, {0}".format(data))
            raise ValueError
        self.feeds.put_item(data=data)
        return

    def put_feed_batch(self, data):
        with self.feeds.batch_write() as batch:
            for item in data:
                if "ID" not in item or "URL" not in item:
                    logging.debug("invalid item, {0}".format(item))
                    raise ValueError
                batch.put_item(data=item)
        return

    def get_all_feeds(self):
        return self.feeds.scan()

    def get_token(self, use):
        items = list(self.tokens.query_2(USE__eq=use))
        if len(items) > 1:
            raise ValueError

        item = items[0]
        return item

    def update_token(self, token):
        if "USE" not in token or "TOKEN" not in token or "REFRESH_TOKEN" not in token:
            logging.debug("invalid token, {0}".format(token))
            raise ValueError
        self.tokens.put_item(data=token)
        return
Code Example #18
File: engine.py Project: hoov/flywheel
    def delete_key(self, model, pkeys=None, **kwargs):
        """
        Delete one or more items from dynamo as specified by primary keys

        Parameters
        ----------
        model : :class:`~flywheel.models.Model`
        pkeys : list, optional
            List of primary key dicts
        **kwargs : dict
            If pkeys is None, delete only a single item and use kwargs as the
            primary key dict

        Returns
        -------
        count : int
            The number of deleted items

        Notes
        -----
        If the model being deleted has no range key, you may use strings
        instead of primary key dicts. ex:

        .. code-block:: python

            >>> class Item(Model):
            ...     id = Field(hash_key=True)
            ...
            >>> items = engine.delete_key(Item, ['abc', 'def', '123', '456'])

        """
        if pkeys is not None:
            keys = pkeys
        else:
            keys = [kwargs]

        count = 0
        table = Table(model.meta_.ddb_tablename, connection=self.dynamo)
        with table.batch_write() as batch:
            for key in keys:
                pkey = model.meta_.pk_dict(scope=key)
                batch.delete_item(**pkey)
                count += 1
        return count
Code Example #19
def insertData():
    conn = boto.dynamodb.connect_to_region('us-east-1')
    try:
        tdescr = conn.describe_table('consumer_complaint')
        consumer_complaint = Table('consumer_complaint')
    except:
        consumer_complaint = createTable()
    try:
        isTableActive = False
        while not isTableActive:
            tdescr = conn.describe_table('consumer_complaint')
            if tdescr['Table']['TableStatus'] == 'ACTIVE':
                start_time = time.time()
                reader = downloadData()
                # Skip the CSV header row, then batch-write the remaining rows
                with consumer_complaint.batch_write() as batch:
                    for i, row in enumerate(reader):
                        if i == 0:
                            continue
                        batch.put_item(data={
                            'Complaint_ID': row[0],
                            'Product': row[1],
                            'Sub-product': row[2],
                            'Issue': row[3],
                            'State': row[4],
                            'ZIP_code': row[5],
                            'Company': row[6],
                            'Company_response': row[7],
                            'Timely_response?': row[8],
                            'Consumer_disputed': row[9],
                        })
                elapsed = time.time() - start_time
                print("--- Time %s in seconds for Insert Query ---" % elapsed)
                isTableActive = True
    except:
        consumer_complaint.delete()
        return render_template('form_submit.html', tableStatus='false')
    return render_template('form_submit.html', tableStatus='true')
Code Example #20
	def updateFromSensor(self, listab):
		tr = Table("APPosto_posti")  # to be corrected
		with tr.batch_write() as batch:
			for item in listab:
				batch.put_item(data={
					'idposto': int(item[0]),
					'extra': item[1],
					'latitudine': item[2],
					'longitudine': item[3],
					'stato': item[1]})
				if self.cache:
					dictio = {'idposto': int(item[0]),
						'extra': item[1],
						'latitudine': item[2],
						'longitudine': item[3],
						'stato': item[1]}
					self.cacheClient.setValue(str(item[0]), dictio, time=self.cexpire)
					dictio = {}
Code Example #21
def dynamo_main(property_appliances):
    conn = dynamodb2.connect_to_region(region_name=region,
                                       aws_access_key_id=access_key,
                                       aws_secret_access_key=secret_key)
    property_table = Table(dynamodb_property_table, connection=conn)
    appliance_table = Table(dynamodb_appliance_table, connection=conn)
    status_table = Table(dynamodb_appl_status_table, connection=conn)
    table = Table(dynamodb_prop_appl_table, connection=conn)

    properties_res = property_table.scan()
    appliance_res = appliance_table.scan()
    statuses_res = status_table.scan()

    properties = []
    appliances = []
    statuses = []

    for prop in properties_res:
        properties.append(dict(prop.get_raw_keys())['propId']['S'].encode('ascii'))

    for appl in appliance_res:
        appliances.append(dict(appl.get_raw_keys())['applId']['S'])

    for status in statuses_res:
        statuses.append(dict(status.get_raw_keys())['statusId']['S'])

    with table.batch_write() as batch:
        for property_appliance in property_appliances:
            global next_id
            # Insert ids here
            property_appliance['propApplId'] = str(next_id)
            next_id += 1
            property_appliance['propertyId'] = properties[randint(0, len(properties) - 1)]
            property_appliance['applianceId'] = str(appliances[randint(0, len(appliances) - 1)])
            property_appliance['statusId'] = str(statuses[randint(0, len(statuses) - 1)])
            property_appliance['statusHistory'] = [
                {'statusId': str(statuses[randint(0, len(statuses) - 1)]), 'dateTime':1487611362},
                {'statusId': str(statuses[randint(0, len(statuses) - 1)]), 'dateTime':1487611362},
                {'statusId': str(statuses[randint(0, len(statuses) - 1)]), 'dateTime':1487611362}
            ]
            batch.put_item(data=property_appliance)
Code Example #22
File: dynamodb.py Project: encolpe/apetizer
class DynamoTable(object):
    
    conn = None
    table = None
    
    table_name = 'test-table'
    hash_key = 'hash_key'
    range_key = 'range_key'
    indexes = []
    
    read_units = 10
    write_units = 10
    
    counters = {'reads':0,'writes':0,'delete':0,'batch_w':0}
    
    def __init__(self, table_name, hash_key, range_key, indexes, read_units=10, write_units=10 ):
        
        self.table_name = table_name
        self.hash_key = hash_key
        self.range_key = range_key
        self.indexes = indexes
        
        self.read_units = read_units
        self.write_units = write_units
        
        try:
            self.connect()
            self.setup()
        except:
            logger.warn('Unable to connect or handle DynamoDB Table')
            traceback.print_exc()
        
    def connect(self):
        
        # connect to the configured DynamoDB region
        self.conn = boto.dynamodb2.connect_to_region( settings.AWS_DYNAMODB_REGION,
                                                     aws_access_key_id=settings.AWS_DYNAMODB_ACCESS_KEY_ID,
                                                     aws_secret_access_key=settings.AWS_DYNAMODB_SECRET_ACCESS_KEY
                                                     )
    
    def setup(self):
        '''
        Sets up the table schema if the table does not exist yet.
        Returns the Table.
        '''
        try:
            self.table = Table.create(self.table_name, connection=self.conn, schema=[
                                                               HashKey(self.hash_key),
                                                               RangeKey(self.range_key),
                                                               ],
                                                               throughput={'read':self.read_units,'write':self.write_units})
            logger.warning('Created new DynamoDB Table')
        except:
            self.table = Table(self.table_name, connection=self.conn, schema=[
                                                               HashKey(self.hash_key),
                                                               RangeKey(self.range_key),
                                                               ],
                                                               throughput={'read':self.read_units,'write':self.write_units})
            
        return self.table
    
    
    def put(self, hash_key, range_key, data):
        '''
        Puts the data to the table, overwriting any existing item with the
        same hash_key/range_key.
        '''
        if settings.DEBUG:
            bench_start = time()
        
        data[self.hash_key] = hash_key
        data[self.range_key] = range_key
        
        item = self.table.put_item( data=data, overwrite=True )
        
        if settings.DEBUG:
            
            if not hash_key in self.counters:
                self.counters[hash_key] = {'reads':0,'writes':0}
            self.counters[hash_key]['writes'] +=1
            self.counters['writes'] +=1
            
            elapsed_time = time() - bench_start
            logger.info(data)
            logger.info("R%sW%s - write %0.5f seconds" % (self.counters[hash_key]['reads'], self.counters[hash_key]['writes'], elapsed_time))
        
        return item
        
    
    def get_latest(self, hash_key ):
        '''
        Retrieve the last recorded item for the given hash key.
        
        '''
        if settings.DEBUG:
            bench_start = time()
        
        kwargs = {}
        kwargs[self.hash_key+'__eq'] = hash_key
        kwargs['limit'] = 1
        
        items = self.table.query( **kwargs )
        
        if items:
            data = {}
            for item in items:
                for key in item.keys():
                    if not key in (self.hash_key, self.range_key):
                        data[key] = item[key]
        else:
            return None
        
        if not len(data):
            return None
        
        if settings.DEBUG:
            
            if not hash_key in self.counters:
                self.counters[hash_key] = {'reads':0,'writes':0}
            self.counters[hash_key]['reads'] +=1
            self.counters['reads'] +=1
            elapsed_time = time() - bench_start
            
            logger.info("R%sW%s - %s - read %0.5f seconds" % (self.counters[hash_key]['reads'], self.counters[hash_key]['writes'], hash_key, elapsed_time))
            
        return data
    
    
    
    def get_range_obj(self, hash_key):
        
        if settings.DEBUG:
            bench_start = time()
        
        kwargs = {}
        kwargs[self.hash_key+'__eq'] = hash_key
        
        # TODO - use batch_get
        items = self.table.query( **kwargs )
        self.counters['reads'] +=1
        data = {}
        
        
        for item in items:
                        
            rkey_data = {}
            rkey = item[self.range_key]
            
            if rkey == 'index':
                data = json.loads(item['value'])
                break
            else:
                for key in item.keys():
                    if key != None and not key in (self.hash_key, self.range_key) and key != 'index':
                        if key == 'value':
                            value = item[key]
                            try:
                                rkey_data = json.loads(str(value))
                            except:
                                rkey_data = value
                        
                    #else:
                    #    rkey_data[key] = item[key]
                    
            data[rkey] = rkey_data
        
        if settings.DEBUG:
            
            if not hash_key in self.counters:
                self.counters[hash_key] = {'reads':0,'writes':0}
            self.counters[hash_key]['reads'] +=1
            self.counters['reads'] +=1
            
            elapsed_time = time() - bench_start
            #logger.info(data)
            logger.info("R%sW%s - %s - read %0.5f seconds" % (self.counters[hash_key]['reads'], self.counters[hash_key]['writes'], hash_key, elapsed_time))
        
        
        return data
        
    
    def set_range_obj(self, hash_key, data, range_keys=None):
        
        # avoid crashing on attempt to write None data
        if data == None:
            return
        
        if range_keys == None:
            range_keys = data.keys()
        
        # TODO
        # add better size estimate
        
        datablocks = 0
        for range_key in data.keys():
            try:
                len_size = len( data[range_key] )
            except:
                len_size = 1
            datablocks += len_size
        
        # update date in msecs since epoch
        update_date = time()
        
        if datablocks > 1000:
            
            #print hash_key,
            #print datablocks
            
            # split over multiple items by data dict key
            with self.table.batch_write() as batch:
                
                for range_key in range_keys:
                    
                    value = json.dumps( data[range_key] )
                    
                    batch_data = {}
                    batch_data[self.hash_key] = hash_key
                    batch_data[self.range_key] = range_key
                    batch_data['value'] = value
                    batch_data['update_date'] = update_date
                    
                    batch.put_item(data=batch_data)
                    
                self.counters['batch_w'] +=1
            
            # delete index if exists
            self.remove_range_obj(hash_key, range_keys=['index'])
            
        else:
            value = json.dumps(data)
            
            batch_data = {}
            batch_data[self.hash_key] = hash_key
            batch_data[self.range_key] = 'index'
            batch_data['value'] = value
            batch_data['update_date'] = update_date
            
            self.table.put_item(data=batch_data, overwrite=True)
            
        self.counters['writes'] +=1
        
        return True
        
    def remove_range_obj(self, hash_key, range_keys=None):
        '''
        deletes ranged object or specific range_keys
        '''
        
        # get range object
        if range_keys == None:
            data = self.get_range_obj(hash_key)
            range_keys = data.keys()
        
        # remove possible index
        try:
            kwargs = {}
            kwargs[self.hash_key] = hash_key
            kwargs[self.range_key] = 'index'
            
            self.table.delete_item( **kwargs )
        except:
            pass
        
        with self.table.batch_write() as batch:
            for range_key in range_keys:
                kwargs = {}
                kwargs[self.hash_key] = hash_key
                kwargs[self.range_key] = range_key
                batch.delete_item( **kwargs )
        
        self.counters['delete'] +=1
        
        return True
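A hedged usage sketch for the DynamoTable class above, assuming the Django settings it reads (AWS_DYNAMODB_REGION and the key pair) are configured; the table and key names are illustrative.

# Hypothetical table wrapping per-user session data.
sessions = DynamoTable('user-sessions', hash_key='user_id',
                       range_key='session_key', indexes=[])

# Small payloads are stored as one item under the range key 'index';
# payloads estimated above 1000 blocks are split into one item per dict key.
sessions.set_range_obj('user-42', {'profile': {'name': 'Ada'}, 'recent': [1, 2, 3]})
print(sessions.get_range_obj('user-42'))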
Code Example #23
File: convert_to_json.py Project: olihb/cnn_analysis
def main(argv):

    # load and transform data
    keywords = load_dict(dictionary_file)
    index = load_index(date_file)

    try:
        opts, args = getopt.getopt(argv, "td")
    except getopt.GetoptError:
        sys.exit(2)

    for opt, arg in opts:

        # load tables
        if opt == '-t':
            words_date, occurrences_by_words, occurrences_by_date = process_index(linear_matrix_file, index, keywords)
            table = Table(dynamoDB_table)
            with table.batch_write() as batch:
                for word in tqdm(words_date.keys(), desc='Upload to dynamoDB', leave=True):
                    output = {}
                    data = words_date[word]


                    occurrences = []
                    dates_size = []
                    dates = []

                    for key,value in data.iteritems():
                        occurrences.append(value)
                        dates.append(key)
                        dates_size.append(occurrences_by_date[key])

                    zipped = zip(dates, occurrences, dates_size)
                    zipped.sort()
                    dates, occurrences, dates_size = zip(*zipped)

                    output['word'] = word
                    output['source'] = linear_matrix_file
                    output['occurrences'] = list(occurrences)
                    output['dates_size'] = list(dates_size)
                    output['dates'] = list(dates)
                    output['occurrences_size'] = occurrences_by_words[word]

                    batch.put_item(data=output)

        # send to database
        elif opt =='-d':

            con = lite.connect(database)
            cur = con.cursor()

            # write correspondence to db
            cur.execute("drop table if exists words_stats")
            # create tables
            cur.execute("create table words_stats(stamp date, word_id int, nb int)")

            # export to db
            lst = list()
            words_stats = process_index_doc(linear_matrix_file,index)
            for date in tqdm(words_stats.keys(), leave=True):
                stats = words_stats[date]
                for word_id in stats.keys():
                    nb_doc = stats[word_id]
                    if nb_doc>10: # otherwise, too big and doesn't bring a lot of information
                        lst.append((date,int(word_id),nb_doc))
                if len(lst)>50000:
                    cur.executemany("insert into words_stats values (?,?,?)", lst)
                    lst = list()
            cur.executemany("insert into words_stats values (?,?,?)", lst)
            con.commit()
Code Example #24
File: db.py Project: satishkt/smart-cam
class DynamoDBUtils(object):

    # <TEST ONLY> Mapping of Customer & Pie/Cam
    cust_pie_dict = {
        "cid1" : ["cam1","cam2"],
        "cid2" : ["cam1","cam2"]
    }

    FACES = {
                 'videos/video_100_frames_1.mp4': {
                     'face_count': 59,
                     'face_count_dtl': ['0', '1', '8', '12', '12', '11', '10', '4', '1', '0'],
                     'face_count_uniq': 3,
                     'face_count_uniq_dtl': ['0', '1', '0', '0', '0', '1', '0', '0', '1', '0'],
                     'frame_count': 100,
                     'time_taken': '0:00:04.731971'
                 },
                 'videos/video_100_frames_2.mp4': {'face_count': 62,
                     'face_count_dtl': ['10', '10', '0', '0', '0', '9', '5', '8', '10', '10'],
                     'face_count_uniq': 2,
                     'face_count_uniq_dtl': ['1', '0', '0', '0', '0', '1', '0', '0', '0', '0'],
                     'frame_count': 100,
                     'time_taken': '0:00:04.955812'
                 }
            }

    rasp_names = ["kitchen", "garage"]

    cols = ['START_TIME','LEN','PROCESSED','S3_BUCKET','S3_KEY','VERSION']

    S3_BUCKET = 'smart-cam'

    def __init__(self):
        cfg = Config()
        aws_access_key_id = cfg.get("aws", "access_key_id")
        aws_secret_access_key = cfg.get("aws", "secret_access_key")
        self.conn = boto.dynamodb2.connect_to_region('us-west-1',
                                        aws_access_key_id=aws_access_key_id,
                                        aws_secret_access_key=aws_secret_access_key)
        self.sc = Table('SMARTCAM', connection=self.conn)
        logger.info(self.conn.list_tables())
        pprint.pprint(self.conn.describe_table('SMARTCAM'))

    # <TEST ONLY> Creates multiple items in table
    def create_items(self, num_items=2):
        cnt = 0
        for rasp_name in DynamoDBUtils.rasp_names:
            for i in xrange(num_items):
                cnt += 1
                self.__create_item(rasp_name, cnt)
                time.sleep(num_items)

    # <TEST ONLY> Creates one item in table
    def __create_item(self, rasp_name, num):
        data = dict()

        data['RASP_NAME'] = rasp_name
        data['START_TIME'] = time.time()
        data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET
        data['S3_KEY'] = 'videos/video_{0}.avi'.format(num)
        data['PROCESSED'] = 0
        data['CLASSIFIED'] = 0
        data['VERSION'] = 0

        logger.info("# Uploading Data for {0}: {1}".format(rasp_name, num))
        self.sc.put_item(data)

    # <TEST ONLY> Creates multiple full items in table
    def create_full_items(self, num_items=10, start_time=1459555200):
        cnt = 0
        with self.sc.batch_write() as batch:
            for rasp_name in DynamoDBUtils.rasp_names:
                st = start_time
                for i in xrange(num_items):
                    cnt += 1
                    if cnt % 2 == 0:
                        batch.put_item(self.__create_full_item(rasp_name, st, 'videos/video_100_frames_1.mp4'))
                    else:
                        batch.put_item(self.__create_full_item(rasp_name, st, 'videos/video_100_frames_2.mp4'))
                    st += 11.25  # 10 + 1.25 secs between 2 video files

    # <TEST ONLY> Creates multiple full items in table
    # All Hard code values for purpose of testing the Backend/UI Integration
    def __create_full_item(self, rasp_name, start_time, s3_key):
        data = dict()

        data['RASP_NAME'] = rasp_name
        data['START_TIME'] = start_time
        data['UPDATE_TIME'] = start_time + 5
        data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET
        data['S3_KEY'] = s3_key

        data['FRAME_COUNT'] = DynamoDBUtils.FACES[s3_key]['frame_count']
        data['FACE_COUNT'] = DynamoDBUtils.FACES[s3_key]['face_count']
        data['FACE_COUNT_UNIQ'] = DynamoDBUtils.FACES[s3_key]['face_count_uniq']

        # Face Counts / Detail
        d = {}
        #d['data'] = ['5','5','5','5','5','5','5','5','5','6']
        data['FACE_COUNT_DTL'] = DynamoDBUtils.FACES[s3_key]['face_count_dtl']

        d = {}
        #d['data'] = ['0','0','0','0','0','0','0','0','0','1']
        data['FACE_COUN_UNIQ_DTL'] = DynamoDBUtils.FACES[s3_key]['face_count_uniq_dtl']

        d = {}
        d['data'] = ['0.1','0.1','0.1','0.05','0.05','0.15','0.001','0.05','0.01','0.01']
        data['FOREGROUND'] = d

        data['PROCESSED'] = 1
        data['VERSION'] = 1

        logger.info("# Uploading Data for {0}: {1}".format(rasp_name, start_time))

        # Converted to a Batch Write
        #self.sc.put_item(data)

        return data

    # Creates one item in table
    def create_item(self, rasp_name, s3_bucket, s3_key, s_time):
        data = dict()

        data['RASP_NAME'] = rasp_name
        data['START_TIME'] = s_time
        data['S3_BUCKET'] = s3_bucket
        data['S3_KEY'] = s3_key
        data['PROCESSED'] = 0
        data['CLASSIFIED'] = 0
        data['VERSION'] = 0
        data['LEN'] = randint(5, 60)

        logger.info("# Uploading Data for {0}: {1}:{2}".format(rasp_name, s3_bucket, s3_key))
        self.sc.put_item(data)

    # Fetch items
    def display_items(self):
        rows = self.sc.query_2(index='PROCESSED-index',PROCESSED__eq=0)
        cnt = 0
        for row in rows:
            logger.info('{0},{1},{2}'.format(row['RASP_NAME'],row['START_TIME'],row['PROCESSED']))
            cnt += 1
        logger.info('# Total unprocessed items: {0}'.format(cnt))
        return rows

    def purge_table(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row.delete()
            logger.info('Deleted Row: {0}'.format(cnt))

    def delete_by_id(self,id):
        cnt = 0
        for row in self.get_items_by_id(id):
            cnt += 1
            row.delete()
            logger.info('Deleted Row: {0}'.format(cnt))

    def reset_processed(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row['PROCESSED'] = 0
            self.update(row)
            logger.info('Update Row: {0}'.format(cnt))

    def reset_classified(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row['CLASSIFIED'] = 0
            self.update(row)
            logger.info('Update Row: {0}'.format(cnt))

    def add_classified(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row['CLASSIFIED'] = 0
            self.update(row)
            logger.info('Update Row: {0}'.format(cnt))

    def get_unprocessed_items(self):
        return self.sc.query_2(index='PROCESSED-index',PROCESSED__eq=0)

    def get_processed_items(self):
        return self.sc.query_2(index='PROCESSED-index',PROCESSED__eq=1)

    def get_unclassified_items(self):
        return self.sc.query_2(index='CLASSIFIED-index',CLASSIFIED__eq=0)

    def get_classified_items(self):
        return self.sc.query_2(index='CLASSIFIED-index',CLASSIFIED__eq=1)

    def get_items_by_id(self, id):
        return self.sc.query_2(RASP_NAME__eq=id)

    def get_items_by_id_range(self, id, start, end):
        return self.sc.query_2(RASP_NAME__eq=id, START_TIME__between=(start, end))

    def update(self, row):
        try:
            row.save(overwrite=True)
        except Exception as e:
            logger.error(e)
            logger.info('[FAILED] Processing: %s %s %s', row['RASP_NAME'], row['START_TIME'], row['PROCESSED'])

    def stats(self,lst):
        quotient, remainder = divmod(len(lst), 2)
        if remainder:
            return sorted(lst)[quotient]
        return sum(lst) / len(lst), sum(sorted(lst)[quotient - 1:quotient + 1]) / 2

    def close(self):
        self.conn.close()
Code Example #25
File: s3mper.py Project: RogerBai/s3mper
class S3mper:
    """ S3mper is a metastore library used to provide a layer of consistency on 
        top of S3 by using dynamodb to record what files should be in the S3
        listing.
        
        See go/s3mper for more information.
    """

    def __init__(self, disabled=False, fail_on_error=False, table_name="ConsistentListingMetastoreTest"):
        self.disabled = disabled
        self.disable_alerts = False
        self.fail_on_error = fail_on_error

        if self.disabled:
            logger.warning("S3mper Explicitly Disabled")

            return

        self.db = Table(table_name)

    def add(self, paths):
        """ Adds a list of Paths to the file metastore and returns True on success. 
        
            Example:
            s.add([path1, path2]) -> True
        """
        if self.disabled:
            return

        epoch = self.__time_now()

        paths = self.__as_paths(paths)

        with self.db.batch_write() as batch:
            for path in paths:
                batch.put_item(data={"path": path.parent().normalize(), "file": path.filename(), "epoch": epoch})

    def list(self, path, include_delete_marked=False):
        """ Lists the given directory in the metastore.  The passed in path must be a directory.
        
            Example: 
            s.list(path) -> []
        """
        if self.disabled:
            return

        if isinstance(path, basestring):
            path = Path(path)

        listing = self.db.query(path__eq=path.normalize(), consistent=True)

        paths = []

        for e in listing:
            if (not include_delete_marked) and "deleted" in e:
                continue

            paths.append(Path("s3n:" + e["path"] + "/" + e["file"]))

        return paths

    def checked_listing(self, s3_listing, path):
        """ Checks the s3_listing against the metastore listing.  All attempts
            are made to use the boto generator for listing if a check isn't
            necessary, but if a check must be made the whole listing for both
            the metastore and s3 listing need to be pulled into memory.
        """
        if self.disabled:
            return s3_listing

        expected = set([p.url for p in self.list(path)])

        if not expected:
            return s3_listing

        # This isn't ideal since we are sucking in the whole listing
        # to perform the check, but if we check on-the-fly, processing
        # could be partially complete before inconsistency is detected
        s3_listing = list(s3_listing())

        for p in s3_listing:
            expected.discard(p if not isinstance(p, Key) else "s3://%s/%s" % (p.bucket, p.name))

        if not expected:
            return s3_listing
        else:
            logger.error(
                "Failed consistency check.  Missing file count %d. Missing paths: %s" % (len(expected), expected)
            )
            self.__send_alert(expected)

            if self.fail_on_error:
                raise S3ConsistencyException(expected)

    def delete(self, paths, delete_marker=False):
        """ Deletes the provided paths from the metastore.  
        
            Completly removing files from the metastore can cause problems 
            because the s3 listing may show the files even though the data may 
            not be available.  This will cause MR jobs to fail.  The delete marker
            can be used to hide files from the listing.
        
            Example:
            s.delete([path1, path2]) -> True
        """
        if self.disabled:
            return

        paths = self.__as_paths(paths)

        if delete_marker:
            for path in paths:
                item = self.db.get_item(path=path.parent().normalize(), file=path.filename())
                item["deleted"] = "true"
                item.partial_save()
        else:
            with self.db.batch_write() as batch:
                for path in paths:
                    batch.delete_item(path=path.parent().normalize(), file=path.filename())

    def __send_alert(self, paths, detail={}):
        if self.disable_alerts:
            return

        try:
            body = {
                "truncated": detail.get("truncated", False),
                "paths": paths if len(paths) <= 10 else paths[0:9],
                "recovered": detail.get("recovered", False),
                "missingFiles": len(paths),
                "stackTrace": traceback.extract_stack(),
                "timestamp": "%s" % datetime.utcnow(),
                "queryId": detail.get("", None),
                "taskId": detail.get("", None),
                "hostname": platform.node(),
                "username": getpass.getuser(),
                "queryType": "DSE Platform Lib",
                "jobId": detail.get("jobId", None),
                "attemptId": detail.get("attemptId", None),
                "email": detail.get("email", None),
                "dataovenId": detail.get("dataovenId", None),
                "logFile": detail.get("logFile", None),
                "inputFile": detail.get("inputFile", None),
                "genieId": detail.get("genieId", None),
                "epoch": self.__time_now(),
            }

            message = RawMessage()
            message.set_body(body)

            conn = sqs.connect_to_region("us-east-1")
            queue = conn.get_queue("s3mper-alert-queue")

            queue.write(message)

        except Exception as e:
            print e

    def __as_paths(self, paths):
        if isinstance(paths, basestring):
            return [Path(paths)]
        elif isinstance(paths, Path):
            return [paths]
        else:
            return paths

    def __time_now(self):
        """ Returns current time in milliseconds. """
        return int(time.time())
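A hedged usage sketch for the S3mper class above; it assumes the module's Path helper, an existing metastore table, and credentials, and the S3 paths are made up.

# Hypothetical usage: record written files, then list them back from the metastore.
s = S3mper(table_name='ConsistentListingMetastoreTest')
s.add(['s3n://bucket/warehouse/t/part-00000',
       's3n://bucket/warehouse/t/part-00001'])
print(s.list('s3n://bucket/warehouse/t'))

# Hide a file from listings without removing its metastore entry.
s.delete(['s3n://bucket/warehouse/t/part-00000'], delete_marker=True)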
Code Example #26
class DynamoDBAdapter(key_value_store.KeyValueStore):
    """ Implementation of an abstract key-value store defined in
    key_value_store.py. The underlying database is amazon DynamoDB.

    The store keeps all objects in a single table with following schema:
    [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the string
    with the object type ('vector', 'set' or 'int') and 'id' is the object id.
    The object value is stored in the 'value' attribute of the table items.

    The table should be created before this code is executed. Amazon
    configuration is assumed to be stored in ~/.boto file as described in
    http://boto.readthedocs.org/en/latest/boto_config_tut.html
    """
    def __init__(self, precision=np.dtype('float32'), table_name='test'):
        """ Create an instance of the dynamodb key-value store.
        precision - a numpy type, elements of all vectors are converted and
           stored in this type;
        table_name - the name of the DynamoDB table which keeps the objects.
        """
        conn = boto.dynamodb2.connect_to_region('eu-west-1')
        if not isinstance(precision, np.dtype):
            raise TypeError("Precision should be a numpy.dtype subtype")
        self.precision = precision
        self.precision_name = precision.name
        self.table = Table(table_name, connection=conn)

    def _get_or_create_item(self, kind, item_id):
        try:
            item = self.table.get_item(kind=kind, id=item_id)
        except ItemNotFound:
            item = Item(self.table)
            item['kind'] = kind
            item['id'] = item_id
        return item

    def _create_vector_item(self, vec_id, vector):
        item = self._get_or_create_item('vector', vec_id)
        item['value'] = Binary(vector.astype(self.precision).tostring())
        item['precision'] = self.precision_name
        return item

    def _vector_value(self, item):
        return np.fromstring(str(item['value']), np.dtype(item['precision']))

    def get_vector_ids(self):
        return [v['id'] for v in self.table.query_2(kind__eq='vector')]

    def get_int_ids(self):
        return [v['id'] for v in self.table.query_2(kind__eq='int')]

    def get_set_ids(self):
        return [v['id'] for v in self.table.query_2(kind__eq='set')]

    def store_vector(self, vec_id, vector):
        item = self._create_vector_item(vec_id, vector)
        item.save()

    def get_vector(self, vec_id):
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id, ))
        return self._vector_value(item)

    def bulk_get_vector(self, vec_ids):
        keys = [{'kind': 'vector', 'id': i} for i in vec_ids]
        vs = self.table.batch_get(keys=keys)
        return [self._vector_value(i) for i in vs]

    def remove_vector(self, vec_id):
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id, ))
        item.delete()

    def add_to_set(self, set_id, element_id):
        item = self._get_or_create_item('set', set_id)
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            item['value'] = set()
        item['value'].add(element_id)
        item.save(overwrite=True)

    def remove_from_set(self, set_id, element_id):
        try:
            item = self.table.get_item(kind='set', id=set_id)
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            raise KeyError('Incorrect value in item %s' % (set_id, ))
        if element_id not in item['value']:
            raise KeyError('Element %s not in set %s' % (element_id, set_id))
        item['value'].remove(element_id)
        item.save()

    def remove_set(self, set_id):
        try:
            item = self.table.get_item(kind='set', id=set_id)
            item.delete()
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))

    def get_set(self, set_id):
        try:
            the_set = self.table.get_item(kind='set', id=set_id)['value']
            return set([str(entry) for entry in the_set])
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id, ))

    def store_int(self, int_id, integer):
        item = self._get_or_create_item('int', int_id)
        item['value'] = integer
        item.save()

    def get_int(self, int_id):
        try:
            return int(self.table.get_item(kind='int', id=int_id)['value'])
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id, ))

    def remove_int(self, int_id):
        try:
            item = self.table.get_item(kind='int', id=int_id)
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id, ))
        item.delete()

    def _aggregate_set_id_element_pairs(self, setpairs):
        """Turns a list of pairs of the form (set_id, element_id) into a list 'L' of
        pairs 'p' of the form (set_id, set_of_element_ids). 'L' has the property
        that if 'p' and 'q' are distinct entries in 'L', then p[0] and q[0] are
        also distinct."""
        set_ids = set([entry[0] for entry in setpairs])
        listlist = [[entry for entry in setpairs if entry[0] == set_id]
                    for set_id in set_ids]
        result = [(pairlist[0][0], set([entry[1] for entry in pairlist]))
                  for pairlist in listlist]
        return result

    def bulk_store_vector(self, vec_ids, vectors):
        if len(vec_ids) != len(vectors):
            raise ValueError
        vecpairs = zip(vec_ids, vectors)
        with self.table.batch_write() as batch:
            for vec_id, vec in vecpairs:
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_vector_old(self, vectors_df):
        """Argument 'vectors' is a dataframe with index vector ids."""
        if len(vec_ids) != len(vectors):
            raise ValueError
        with self.table.batch_write() as batch:
            for ind in vectors_df.index:
                vec_id = str(ind)
                vec = vectors_df.loc[ind].values
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_int(self, int_ids, integers):
        """Argument 'intpairs' is a list of pairs of the form (int_id, integer)."""
        if len(int_ids) != len(integers):
            raise ValueError
        intpairs = zip(int_ids, integers)
        with self.table.batch_write() as batch:
            for pair in intpairs:
                int_id, integer = pair
                item = self._get_or_create_item('int', int_id)
                item['value'] = integer
                batch.put_item(item)

    def bulk_add_to_set(self, set_ids, element_ids):
        """batch_write() objects if the same item is written to more
        than once per batch, hence we aggregate all (set_id, element_id)
        pairs into a list of pairs (set_id, element_ids), where
        the 'set_id's are pairwise distinct, and the 'element_ids'
        are sets."""
        if len(set_ids) != len(element_ids):
            raise ValueError
        setpairs = zip(set_ids, element_ids)
        setlist = self._aggregate_set_id_element_pairs(setpairs)
        with self.table.batch_write() as batch:
            for pair in setlist:
                set_id, element_ids = pair
                item = self._get_or_create_item('set', set_id)
                if 'value' not in item.keys() or not isinstance(
                        item['value'], set):
                    item['value'] = set()
                item['value'].update(element_ids)
                batch.put_item(item)
Code example #27
0
def roundDecimal(flt):
    return int(round(Decimal(flt), 7) * 10000000)
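# e.g. (illustrative value) roundDecimal(9.5010367) -> 95010367, an int DynamoDB stores without float issues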


row = rows.next()

tableRowCount = 1

#round to 7 decimal places so we can save to dynamodb (known bug with floats)

while row != None:
    #write 25 rows at a time; saves throughput and improves performance
    batchIndex = 0

    try:
        with tdidfIndexTable.batch_write() as batch:
            print "starting new batch"
            while (batchIndex != 25):

                if (row == None):
                    break

                #calculate Tf-idf
                tdIdfValue = tdIdfCalculator.Calculate(row[columnWithBody])
                articleId = row[columnWithUniqueId]

                #No need to add entry for tf-idf if keyword does not appear in the source at all.
                if tdIdfValue > 0:
                    data = {
                        'id': uniqueId,
                        'word': keyword,
Code example #28
0
File: aws_data.py Project: ChaseSnapshot/smcity
class AwsDataFactory(DataFactory):

    def __init__(self, config):
        '''
        Constructor.

        @param config Configuration settings. Expected definition:

        Section: database
        Keys:    global_data_table, set_data_table
        Type:    string
        Desc:    Names of the global and set Data model tables
        @paramType ConfigParser
        @returns n/a
        '''
        self.global_table = Table(config.get('database', 'global_data_table'))
        self.set_table = Table(config.get('database', 'set_data_table'))

    def create_data(self, content, datum_id, location, set_id, timestamp, type):
        ''' {@inheritDocs} '''
        assert content is not None
        assert datum_id is not None
        assert -180 <= location[0] and location[0] < 180, location[0]
        assert -90 <= location[1] and location[1] < 90, location[1]
        assert set_id is not None
        assert timestamp is not None
        assert type is not None
        
        # Normalize the values
        lat_norm       = int(location[1] * 10000000)
        lon_norm       = int(location[0] * 10000000)
        timestamp_norm = strftime('%Y-%m-%d %H:%M:%S', timestamp)

        # Create the database record
        data = {
            'content' : content,
            'datum_id' : datum_id,
            'lat' : lat_norm,
            'lat_copy' : lat_norm,
            'lon' : lon_norm,
            'lon_copy' : lon_norm,
            'set_id' : set_id,
            'timestamp' : timestamp_norm,
            'timestamp_copy' : timestamp_norm,
            'type' : type
        }

        result = False 
        if set_id == 'global': # If this is a global data point
            result = self.global_table.put_item(data=data)
        else: # If this is a set data point
            result = self.set_table.put_item(data=data)

        # If we failed to create the database record
        if result is False:
            raise CreateError("Failed to create the Data(" + str(data) + ")!")
        
    def copy_data(self, set_id, datas):
        ''' {@inheritDocs} '''
        assert set_id is not None

        with self.set_table.batch_write() as batch:
            for data in datas:
                batch.put_item(data = {
                    'content' : data.get_content(), 
                    'datum_id' : data.get_datum_id(),
                    'lat' : data.record['lat'],
                    'lat_copy' : data.record['lat_copy'],
                    'lon' : data.record['lon'],
                    'lon_copy' : data.record['lon_copy'],
                    'set_id' : set_id,
                    'timestamp' : data.record['timestamp'],
                    'timestamp_copy' : data.record['timestamp_copy'],
                    'type' : data.record['type']
                })

    def filter_global_data(self, min_timestamp=None, max_timestamp=None,
                                 min_lat=None, max_lat=None,
                                 min_lon=None, max_lon=None,
                                 segment_id=0, num_segments=1,
                                 type=None
                                 ):
        ''' {@inheritDocs} '''
        kwargs = {}
        if min_timestamp is not None:
            kwargs['timestamp__gte'] = strftime('%Y-%m-%d %H:%M:%S', min_timestamp)
        if max_timestamp is not None:
            kwargs['timestamp_copy__lte'] = strftime('%Y-%m-%d %H:%M:%S', max_timestamp)
        if min_lat is not None:
            kwargs['lat__gte'] = int(min_lat * 10000000)
        if max_lat is not None:
            kwargs['lat_copy__lte'] = int(max_lat * 10000000)
        if min_lon is not None:
            kwargs['lon__gte'] = int(min_lon * 10000000)
        if max_lon is not None:
            kwargs['lon_copy__lte'] = int(max_lon * 10000000)
        if type is not None: 
            kwargs['type__eq'] = type
        kwargs['set_id__eq'] = 'global'
        kwargs['segment'] = segment_id
        kwargs['total_segments'] = num_segments

        logger.debug("Scan Args: %s", kwargs)

        return AwsDataIterator(self.global_table.scan(**kwargs))

    def get_data_set(self, set_id):
        ''' {@inheritDocs} '''
        return AwsDataIterator(self.set_table.query(set_id__eq=set_id))
Code example #29
0
File: ingestAWS.py Project: alanarangof/survey
# check if the file is present
"""
    c.execute('''CREATE TABLE users
             (email text, token text)''')
    c.execute('''CREATE TABLE result
             (token text, q1 text, q2 text, q3 text)''')
"""

conn = boto.dynamodb2.connect_to_region(
        'us-west-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )

users = Table('survey2_users', connection=conn)

unique_emails = set()
with users.batch_write() as batch:
    with open(csvfile) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for line in csvreader:
            if line[0] not in unique_emails:
                batch.put_item(data={'email':line[0], 
                                 'token': str(uuid.uuid4()),
                                 })
                unique_emails.add(line[0])
            else:
                print "DUPE: %s"%line[0]

Code example #30
0
File: ddb_runs.py Project: yz-/ut
class DDBRuns(Dynamo):

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.table = Table('test_runs', connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        When called directly (as should be done for production code), sets table to the production 'runs' table.
        """
        super(DDBRuns, self).__init__(access_key, secret)
        self.table = Table('runs', connection=self.connection)

    def save_new_run(self, dt_str=None, start_date_str=None, end_date_str=None):
        """
        dt_str = datetime of run. Defaults to now.
        start_date_str = the start date for look-back of query performance data processing. * No default
        end_date_str = the end date for query performance data processing. Defaults to today.
        """
        assert start_date_str, "start_date_str is required when saving a new run to runs table."
        assert DAY_STR_RE.match(start_date_str)
        if end_date_str:
            assert DAY_STR_RE.match(end_date_str)
        if dt_str:
            assert SECOND_STR_RE.match(dt_str)

        dt_str = dt_str or datetime.now().strftime(SECOND_STR_FORMAT)
        end_date_str = end_date_str or datetime.now().strftime(DAY_STR_FORMAT)
        return self.table.put_item(data={'dt': dt_str, 'start': start_date_str, 'end': end_date_str})

    def most_recent_start_date_str(self):
        """
        :return: a string representing most recent start date from db
        """
        df = self.get_runs_df()
        if df.empty:
            return None
        else:
            # should already be sorted, but just in case...
            df.sort(columns=['dt'], ascending=True, inplace=True)
            return df.iloc[len(df)-1]['start']

    def most_recent_end_date_str(self):
        """
        :return: a string representing most recent end date from db
        """
        df = self.get_runs_df()
        if df.empty:
            return None
        else:
            # should already be sorted, but just in case...
            df.sort(columns=['dt'], ascending=True, inplace=True)
            return df.iloc[len(df)-1]['end']

    def get_runs_df(self):
        """
        Returns all table as dataframe, sorted with most recent entry on bottom (ascending order)
        """
        df = DataFrame([{k: v for k, v in r.items()} for r in self.table.scan()])
        if df.empty:
            return df
        else:
            df.sort(columns=['dt'], ascending=True, inplace=True)
            # force df to have columns in this order
            return df[['dt', 'start', 'end']]

    def modify_throughput(self, requested_read, requested_write, table=None):
        table = table or self.table
        return super(DDBRuns, self).modify_throughput(requested_read, requested_write, table)

    def truncate_table(self):
        """
        WARNING! Only use for test mode table
        """
        assert self.table.table_name == 'test_runs', "Will only truncate test table. To truncate production table, run code manually"
        with self.table.batch_write() as batch:
            for item in self.table.scan():
                batch.delete_item(dt=item['dt'])


    def thors_start_end_date_strings(self, new_run=True, days_ago_start=30):
        if new_run:
            if days_ago_start is not None:
                print days_ago_start
                start_date_str = self._days_ago_str(days_ago_start)
            else:
                start_date_str = self.most_recent_end_date_str()
            end_date_str = date.today().strftime(DAY_STR_FORMAT)
        else:
            start_date_str = self.most_recent_start_date_str()
            end_date_str = self.most_recent_end_date_str()
            assert start_date_str, "Start date string is None, please check the database since we are not doing a new run"
            assert end_date_str, "End date string is None, please check the database since we are not doing a new run"
        return start_date_str, end_date_str

    def _days_ago_str(self, num_days_ago):
        return (date.today() - timedelta(days=num_days_ago)).strftime(DAY_STR_FORMAT)

    def start_end_date_strings(self, new_run=True, days_ago_start=30):
        if new_run:
            start_date_str = self.most_recent_end_date_str() or self._days_ago_str(days_ago_start)
            end_date_str = date.today().strftime(DAY_STR_FORMAT)
        else:
            start_date_str = self.most_recent_start_date_str()
            end_date_str = self.most_recent_end_date_str()
        return start_date_str, end_date_str
Code example #31
0
File: load_topics.py Project: olihb/cnn_analysis
def dump_to_dynamo(cur, dynamoDB_table, json_output_file):
    # get occurrences by date
    print "get occurrences by date"
    sql = """   select stamp, count(*)
                from dates
                group by stamp"""
    cur.execute(sql)
    total_occ = {}
    dates = cur.fetchall()
    for date in dates:
        total_occ[date[0]]=date[1]

    # get occurrences by topic
    print "get occurrences by topic"
    sql = """   select m.topic, stamp, count(*) n
                from (
                    select id, topic, cast(max(occ) as real) occu,(select cast(sum(k.occ) as real) from document k where d.id=k.id) t
                    from document d
                    group by id
                ) m
                join dates da on m.id=da.id
                where occu/t>0.5
                group by m.topic, stamp
                order by m.topic, stamp asc"""
    cur.execute(sql)
    occurrences = cur.fetchall()
    topics = defaultdict(lambda: defaultdict(lambda:[]))


    print "iterate over result set"
    for row in tqdm(occurrences, leave=True):
        topic = row[0]
        stamp = row[1]
        occ = row[2]
        topics[topic]['occurrences'].append(int(occ))
        topics[topic]['dates'].append(str(stamp))
        topics[topic]['dates_size'].append(int(total_occ[stamp]))


    print "push to dynamodb table"
    table = Table(dynamoDB_table)
    with table.batch_write() as batch:
        for topic in tqdm(topics.keys(), leave=True):
            output = {}

            output['word'] = "key_topic_"+str(topic)
            output['source'] = 'load_db_topics'
            output['occurrences'] = topics[topic]["occurrences"]
            output['dates_size'] = topics[topic]["dates_size"]
            output['dates'] = topics[topic]["dates"]
            output['occurrences_size']=99

            batch.put_item(data=output)

    # output json description file
    print "output json description file"

    # get topics --because sqlite doesn't support row numbers
    topics = []
    sql = "select distinct topic_id from model"
    cur.execute(sql)
    rows = cur.fetchall()
    for row in rows:
        topics.append(row[0])

    # iterate over sql to build json
    output = []
    for index, topic in tqdm(enumerate(topics)):
        sql = "select word from model m join dictionary d on d.id=m.word_id where topic_id=? order by m.occ desc limit 10"
        cur.execute(sql,(topic,))
        rows = cur.fetchall()
        words = []
        for word in rows:
            words.append(word[0])
        line = dict()
        line['id']=index
        line['name']=", ".join(words)
        line['key']="key_topic_"+str(topic)
        output.append(line)

    # output to file
    with open(json_output_file, 'w') as outfile:
        json.dump(output,outfile)
Code example #32
0
class S3mper:
    """ S3mper is a metastore library used to provide a layer of consistency on 
        top of S3 by using dynamodb to record what files should be in the S3
        listing.
        
        See go/s3mper for more information.
    """
    def __init__(self,
                 disabled=False,
                 fail_on_error=False,
                 table_name='ConsistentListingMetastoreTest'):
        self.disabled = disabled
        self.disable_alerts = False
        self.fail_on_error = fail_on_error

        if self.disabled:
            logger.warning('S3mper Explicitly Disabled')

            return

        self.db = Table(table_name)

    def add(self, paths):
        """ Adds a list of Paths to the file metastore and returns True on success. 
        
            Example:
            s.add([path1, path2]) -> True
        """
        if self.disabled:
            return

        epoch = self.__time_now()

        paths = self.__as_paths(paths)

        with self.db.batch_write() as batch:
            for path in paths:
                batch.put_item(
                    data={
                        'path': path.parent().normalize(),
                        'file': path.filename(),
                        'epoch': epoch
                    })

    def list(self, path, include_delete_marked=False):
        """ Lists the given directory in the metastore.  The passed in path must be a directory.
        
            Example: 
            s.list(path) -> []
        """
        if self.disabled:
            return

        if isinstance(path, basestring):
            path = Path(path)

        listing = self.db.query(path__eq=path.normalize(), consistent=True)

        paths = []

        for e in listing:
            if (not include_delete_marked) and 'deleted' in e:
                continue

            paths.append(Path('s3n:' + e['path'] + "/" + e['file']))

        return paths

    def checked_listing(self, s3_listing, path):
        """ Checks the s3_listing against the metastore listing.  All attempts
            are made to use the boto generator for listing if a check isn't
            necessary, but if a check must be made the whole listing for both
            the metastore and s3 listing need to be pulled into memory.
        """
        if self.disabled:
            return s3_listing

        expected = set([p.url for p in self.list(path)])

        if not expected:
            return s3_listing

        #This isn't ideal since we are sucking in the whole listing
        #to perform the check, but if we check on-the-fly, processing
        #could be partially complete before inconsistency is detected
        s3_listing = list(s3_listing())

        for p in s3_listing:
            expected.discard(p if not isinstance(p, Key) else 's3://%s/%s' %
                             (p.bucket, p.name))

        if not expected:
            return s3_listing
        else:
            logger.error(
                "Failed consistency check.  Missing file count %d. Missing paths: %s"
                % (len(expected), expected))
            self.__send_alert(expected)

            if self.fail_on_error:
                raise S3ConsistencyException(expected)

    def delete(self, paths, delete_marker=False):
        """ Deletes the provided paths from the metastore.  
        
            Completely removing files from the metastore can cause problems
            because the s3 listing may show the files even though the data may 
            not be available.  This will cause MR jobs to fail.  The delete marker
            can be used to hide files from the listing.
        
            Example:
            s.delete([path1, path2]) -> True
        """
        if (self.disabled):
            return

        paths = self.__as_paths(paths)

        if delete_marker:
            for path in paths:
                item = self.db.get_item(path=path.parent().normalize(),
                                        file=path.filename())
                item['deleted'] = "true"
        else:
            with self.db.batch_write() as batch:
                for path in paths:
                    batch.delete_item(path=path.parent().normalize(),
                                      file=path.filename())

    def __send_alert(self, paths, detail={}):
        if self.disable_alerts:
            return

        try:
            body = {
                "truncated": detail.get('truncated', False),
                "paths": paths if len(paths) <= 10 else paths[0:9],
                "recovered": detail.get('recovered', False),
                "missingFiles": len(paths),
                "stackTrace": traceback.extract_stack(),
                "timestamp": "%s" % datetime.utcnow(),
                "queryId": detail.get('', None),
                "taskId": detail.get('', None),
                "hostname": platform.node(),
                "username": getpass.getuser(),
                "queryType": "DSE Platform Lib",
                "jobId": detail.get('jobId', None),
                "attemptId": detail.get('attemptId', None),
                "email": detail.get('email', None),
                "dataovenId": detail.get('dataovenId', None),
                "logFile": detail.get('logFile', None),
                "inputFile": detail.get('inputFile', None),
                "genieId": detail.get('genieId', None),
                "epoch": self.__time_now()
            }

            message = RawMessage()
            message.set_body(body)

            conn = sqs.connect_to_region("us-east-1")
            queue = conn.get_queue('s3mper-alert-queue')

            queue.write(message)

        except Exception as e:
            print e

    def __as_paths(self, paths):
        if isinstance(paths, basestring):
            return [Path(paths)]
        elif isinstance(paths, Path):
            return [paths]
        else:
            return paths

    def __time_now(self):
        """ Returns current time in milliseconds. """
        return int(time.time())
Code example #33
0
class ContentStore:
    """
    DynamoDb proxy for wikidata store, which contains page ids, titles, and contents.

    This class is designed to be used in a "with" block, as in
    with ContentStore() as store:
        ... statements using store ...

    You must have a configuration file in your home directory containing the AWS access key id
    and secret key for the IAM identity that has access to the database. This file must be named
    ".boto", and must contain a section of the form:

    [Credentials]
    aws_access_key_id = <access key id>
    aws_secret_access_key = <secret key id>
    """

    def add_page(self, pageId, pageTitle, pageText):
        """
        Adds a new page to the data store.

        New pages are batched and sent to DynamoDB 25 at a time. Clients may call flush() to
        cause all pending pages to be uploaded immediately.
        :param str pageId: The unique identifier of the page
        :param str pageTitle: The title of the page
        :param str pageText: The text contents of the page, in wiki markup format
        """
        self._batch_write.put_item(data={
            'pageId': pageId,
            'pageTitle': pageTitle,
            'pageText': pageText})

    def flush(self):
        """
        Ensure that all added pages are persisted to DynamoDB.
        """
        self._batch_write.flush()

    def get_content(self, pageId):
        """
        Retrieve the content of a page
        :param str pageId: The unique page identifier
        :return: The text content of a page in wiki markup format
        """
        item = self._table.get_item(pageId=pageId)
        return item['pageText']

    def get_title(self, pageId):
        """
        Get the title of a page
        :param str pageId: The unique page identifier
        :return: The title content of a page
        """
        item = self._table.get_item(pageId=pageId)
        return item['pageTitle']

    def __enter__(self):
        self._previous_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
        self._previous_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
        del os.environ["AWS_ACCESS_KEY_ID"]
        del os.environ["AWS_SECRET_ACCESS_KEY"]
        connection = boto.dynamodb2.connect_to_region('us-east-1')
        self._table = Table('wikidata', connection=connection)
        self._batch_write = self._table.batch_write()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        os.environ["AWS_ACCESS_KEY_ID"] = self._previous_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = self._previous_secret_key
        if self._batch_write.should_flush():
            self._batch_write.flush()

        # Cause the exception to be re-raised if one has occurred
        return exc_value is None
Code example #34
0
File: ddb_slurps.py Project: yz-/ut
class DDBSlurps(Dynamo):

    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.slurps_table = Table('test_slurps', connection=instance.connection)
        instance.failed_slurps_table = Table('test_failed_slurps', connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with test_slurps and test_failed_slurps tables
        """
        super(DDBSlurps, self).__init__(access_key, secret)

        self.slurps_table = Table('slurps', connection=self.connection)
        self.failed_slurps_table = Table('failed_slurps', connection=self.connection)

    def save_slurp_info(self, slurp_info_, overwrite=True):
        """
        slurp_info_ can either be in the form of a list of dicts or else a single dict.
        If slurp_info is a list, batch write will be used
        """
        if isinstance(slurp_info_, dict):
            self.slurps_table.put_item(slurp_info_, overwrite=overwrite)
        elif isinstance(slurp_info_, list):
            with self.slurps_table.batch_write() as batch:
                for s in slurp_info_:
                    batch.put_item(data=s, overwrite=overwrite)
        else:
            raise TypeError, "slurp_info must be a dict or a list of dicts, not a {}".format(type(slurp_info_))

    def save_failed_slurp(self, searchterm):
        self.failed_slurps_table.put_item(data={'searchterm': searchterm, 'datetime': datetime.now().isoformat()},
                                          overwrite=True)

    def get_slurp_info(self, search_term_=None):
        """
        search_term_ can be either a string or a list of strings. Each string should be a search term you are looking
        for in the db.
        Returns either a single list of key-value tuples (if search_term_ was a string)
        or a list of key-value tuples (if search_term_ was a list)
        Each list of key-value tuples can easily be converted to a dict or an OrderedDict by the client.
        """

        # searchterm_ is a STRING
        if isinstance(search_term_, basestring):
            if search_term_:
                slurp_info = (self.slurps_table.get_item(searchterm=search_term_)).items()
            else:
                slurp_info = []

        # searchterm is a LIST of strings
        elif isinstance(search_term_, list):
            if search_term_:
                # create a set of non-empty searchterms. We use a set to avoid a duplicate query error from the db
                set_of_sts = {st for st in search_term_ if st}
                # create a list of dicts from the set
                list_of_st_dicts = [{'searchterm': st} for st in set_of_sts]
                res = self.slurps_table.batch_get(list_of_st_dicts)
                try:
                    slurp_info = [i.items() for i in res]
                except (StopIteration, IndexError):
                    # If res is empty, we get one of these errors when trying to iterate.
                    slurp_info = []
            else:
                slurp_info = []

        # searchterm is an unexpected type
        else:
            raise TypeError, "search_term_ must be a dict or a list of dicts, not a {}".format(type(search_term_))

        return slurp_info

    def existing_and_missing_uni(self, searchterm_list):
        """
        Takes a list of searchterm strings and returns a list of searchterm strings that were found in the db (in unicode)
        and a list of the searchterms that were missing from the found results
        """
        # make sure in utf8 before we send request to the db
        input_sts_utf8 = [to_utf8_or_bust(i) for i in searchterm_list]
        found_sts_info = self.get_slurp_info(input_sts_utf8)
        found_sts_uni = [to_unicode_or_bust(dict(i)['searchterm']) for i in found_sts_info]
        input_sts_uni = [to_unicode_or_bust(i) for i in input_sts_utf8]
        missing_sts_uni = order_conserving.setdiff(input_sts_uni, found_sts_uni)
        return found_sts_uni, missing_sts_uni

    def get_table(self, table_name):
        """
        Convenience method for client who may wish to get a specific table from the DynamoDB connection
        """
        return Table(table_name, connection=self.connection)

    def truncate_failed_slurp_table(self):
        """
        """
        with self.failed_slurps_table.batch_write() as batch:
            for item in self.failed_slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def truncate_slurp_table(self):
        """
        WARNING! Only use for test mode table
        """
        assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
        test_slurps_table = Table('test_slurps', connection=self.connection)
        with test_slurps_table.batch_write() as batch:
            for item in self.slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_failed_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write, self.failed_slurps_table)

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write, self.slurps_table)

    def get_slurps_table_info(self):
        return self.get_table_info(self.slurps_table)

    def get_failed_slurps_table_info(self):
        return self.get_table_info(self.failed_slurps_table)
Code example #35
0
pending_acc_ids = [7, 8]
property_acc_ids = [9, 10]

conn = dynamodb2.connect_to_region(region_name=region,
                                   aws_access_key_id=access_key,
                                   aws_secret_access_key=secret_key)

property_table = Table(dynamodb_property_table, connection=conn)
pend_tnt_table = Table(dynamodb_pending_tenant_table, connection=conn)
prop_tnt_table = Table(dynamodb_property_tenant_table, connection=conn)

properties = []

for prop in properties_res:
    properties.append(dict(prop.get_raw_keys())['id']['S'].encode('ascii'))

with pend_tnt_table.batch_write() as batch:
    for pend_acc_id in pending_acc_ids:
        # Insert ids here
        pending = {}
        pending['pendTntid'] = str(pend_next_id)
        pend_next_id += 1
        pending['propertyId'] = properties[randint(0, len(properties) - 1)]
        pending['status'] = pending_types[randint(0, len(pending_types) - 1)]
        pending['accountId'] = pend_acc_id
        batch.put_item(data=pending)

with prop_tnt_table.batch_write() as batch:
    for property_acc_id in property_acc_ids:
        prop = {}
        prop['propTntId'] = str(prop_next_id)
        prop_next_id += 1
Code example #36
0
def add_sample_stops():
	# try:
	c = Coords()
	tb_stops = Table('stops', connection=cm.db)
	tb_stops_loc = Table('stops_loc', connection=cm.db)
	with tb_stops.batch_write() as batch:
		batch.put_item(data={
			'stop_id': 'alpy',
			'name_part': 'a',
			'name'	 : 'aleppey',
			'level_2': 'alappuzha',
			'level_1': 'kerala',
			'country': 'india'
		})
		batch.put_item(data={
			'stop_id': 'kykm',
			'name_part': 'k',
			'name'	 : 'kayankulam',
			'level_2': 'alappuzha',
			'level_1': 'kerala',
			'country': 'india'
		})
		batch.put_item(data={
			'stop_id': 'ochr',
			'name_part': 'o',
			'name'	 : 'oachira',
			'level_2': 'kollam',
			'level_1': 'kerala',
			'country': 'india'
		})
		batch.put_item(data={
			'stop_id': 'vlkv',
			'name_part': 'v',
			'name'	 : 'vallikavu',
			'level_2': 'kollam',
			'level_1': 'kerala',
			'country': 'india'
		})
		batch.put_item(data={
			'stop_id': 'kpy',
			'name_part': 'k',
			'name'	 : 'karunagapally',
			'level_2': 'kollam',
			'level_1': 'kerala',
			'country': 'india'
		})
		batch.put_item(data={
			'stop_id': 'klm',
			'name_part': 'k',
			'name'	 : 'kollam',
			'level_2': 'kollam',
			'level_1': 'kerala',
			'country': "india"
		})

	# Adding to location
	with tb_stops_loc.batch_write() as batch:
		batch.put_item(data={
			'stop_id': 'alpy',
			'lat_part': "9",
			'lat': c.integerify(9.5010367),
			'lon': c.integerify(76.3421059),
			'lon_part': '76'
		})
		batch.put_item(data={
			'stop_id': 'kykm',
			'lat_part':"9",
			'lon_part': '76',
			'lat': c.integerify(9.1729609),
			'lon': c.integerify(76.5073299)
			
		})
		batch.put_item(data={
			'stop_id': 'ochr',
			'lat_part':"9",
			'lon_part': '76',
			'lat': c.integerify(9.1272739),
			'lon': c.integerify(76.5065333)
			
		})
		batch.put_item(data={
			'stop_id': 'vlkv',
			'lat': c.integerify(9.0938471),
			'lon': c.integerify(76.4916068),
			'lat_part':"9",
			'lon_part': '76'
		})
		batch.put_item(data={
			'stop_id': 'kpy',
			'lat_part':"9",
			'lon_part': '76',
			'lat': c.integerify(9.0609902),
			'lon': c.integerify(76.5341999)
			
		})
		batch.put_item(data={
			'stop_id': 'klm',
			'lat_part':"8",
			'lon_part': '76',
			'lat': c.integerify(8.8862714),
			'lon': c.integerify(76.5938379)
			
		})
	return message_helper.success()
Code example #37
0
File: fillTDIDFValues.py Project: motasem-salem/MIDS
#boto dynamodb2 bug won't let us store a Python float, thus we round to 7 decimal places and multiply by 10^7
def roundDecimal(flt):
	return int(round(Decimal(flt),7) * 10000000)

row = rows.next()

tableRowCount = 1

#round to 7 decimal places so we can save to dynamodb (known bug with floats)

while row != None:
	#write 25 rows at a time; saves throughput and improves performance
	batchIndex = 0

	try: 
		with tdidfIndexTable.batch_write() as batch:
			print "starting new batch"
			while (batchIndex != 25):	
	
				if (row == None):
					break
				
				#calculate Tf-idf
				tdIdfValue = tdIdfCalculator.Calculate(row[columnWithBody])
				articleId = row[columnWithUniqueId]
	
				#No need to add entry for tf-idf if keyword does not appear in the source at all. 
				if tdIdfValue > 0:
					data={
					     'id': uniqueId,
					     'word': keyword,
Code example #38
0
File: dynamo_script.py Project: MattBubernak/Team_4
    connection=conn
)

#Input json file name
js = sys.argv[2]

#Loading data in Tables
with open(js) as json_file :
          data = json.load(json_file)
          size = len(data)
          i = 0
          printProgress(i, size, prefix = 'Data', suffix = 'Complete', barLength = 50)
          for date in data:
                topics_list = []
                for index,topic in enumerate(data[date]) :
                   topics_list.append(str(data[date][topic]["topic_id"]) + '#' + topic + '#' + data[date][topic]['category'] + '#' + ''.join(reversed(date.split('-'))) + '#' + str(data[date][topic]["score"]))
                   if len(topics_list) == 10 or index == len(data[date]) - 1 :
                     try:
                        with topics.batch_write() as batch:
                            for item in topics_list:
                                items = item.split('#')
                                batch.put_item(data={'Name' : items[1] , 'Category' : items[2] ,'Date' : sys.argv[1] + items[3] ,'Score': Decimal(items[4])})
                                #print items
                            topics_list = []
                        sleep(0.1)
                     except :
                            print sys.exc_info()[0], items

                printProgress(i, size, prefix = 'Data', suffix = 'Complete', barLength = 50)
                i += 1
Code example #39
0
class ContentStore:
    """
    DynamoDb proxy for wikidata store, which contains page ids, titles, and contents.

    This class is designed to be used in a "with" block, as in
    with ContentStore() as store:
        ... statements using store ...

    You must have a configuration file in your home directory containing the AWS access key id
    and secret key for the IAM identity that has access to the database. This file must be named
    ".boto", and must contain a section of the form:

    [Credentials]
    aws_access_key_id = <access key id>
    aws_secret_access_key = <secret key id>
    """
    def add_page(self, pageId, pageTitle, pageText):
        """
        Adds a new page to the data store.

        New pages are batched and sent to DynamoDB 25 at a time. Clients may call flush() to
        cause all pending pages to be uploaded immediately.
        :param str pageId: The unique identifier of the page
        :param str pageTitle: The title of the page
        :param str pageText: The text contents of the page, in wiki markup format
        """
        self._batch_write.put_item(data={
            'pageId': pageId,
            'pageTitle': pageTitle,
            'pageText': pageText
        })

    def flush(self):
        """
        Ensure that all added pages are persisted to DynamoDB.
        """
        self._batch_write.flush()

    def get_content(self, pageId):
        """
        Retrieve the content of a page
        :param str pageId: The unique page identifier
        :return: The text content of a page in wiki markup format
        """
        item = self._table.get_item(pageId=pageId)
        return item['pageText']

    def get_title(self, pageId):
        """
        Get the title of a page
        :param str pageId: The unique page identifier
        :return: The title content of a page
        """
        item = self._table.get_item(pageId=pageId)
        return item['pageTitle']

    def __enter__(self):
        self._previous_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
        self._previous_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
        del os.environ["AWS_ACCESS_KEY_ID"]
        del os.environ["AWS_SECRET_ACCESS_KEY"]
        connection = boto.dynamodb2.connect_to_region('us-east-1')
        self._table = Table('wikidata', connection=connection)
        self._batch_write = self._table.batch_write()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        os.environ["AWS_ACCESS_KEY_ID"] = self._previous_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = self._previous_secret_key
        if self._batch_write.should_flush():
            self._batch_write.flush()

        # Cause the exception to be re-raised if one has occurred
        return exc_value is None
Code example #40
0
class DynamoDBUtils(object):

    # <TEST ONLY> Mapping of Customer & Pie/Cam
    cust_pie_dict = {"cid1": ["cam1", "cam2"], "cid2": ["cam1", "cam2"]}

    FACES = {
        'videos/video_100_frames_1.mp4': {
            'face_count':
            59,
            'face_count_dtl':
            ['0', '1', '8', '12', '12', '11', '10', '4', '1', '0'],
            'face_count_uniq':
            3,
            'face_count_uniq_dtl':
            ['0', '1', '0', '0', '0', '1', '0', '0', '1', '0'],
            'frame_count':
            100,
            'time_taken':
            '0:00:04.731971'
        },
        'videos/video_100_frames_2.mp4': {
            'face_count':
            62,
            'face_count_dtl':
            ['10', '10', '0', '0', '0', '9', '5', '8', '10', '10'],
            'face_count_uniq':
            2,
            'face_count_uniq_dtl':
            ['1', '0', '0', '0', '0', '1', '0', '0', '0', '0'],
            'frame_count':
            100,
            'time_taken':
            '0:00:04.955812'
        }
    }

    rasp_names = ["kitchen", "garage"]

    cols = ['START_TIME', 'LEN', 'PROCESSED', 'S3_BUCKET', 'S3_KEY', 'VERSION']

    S3_BUCKET = 'smart-cam'

    def __init__(self):
        cfg = Config()
        aws_access_key_id = cfg.get("aws", "access_key_id")
        aws_secret_access_key = cfg.get("aws", "secret_access_key")
        self.conn = boto.dynamodb2.connect_to_region(
            'us-west-1',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key)
        self.sc = Table('SMARTCAM', connection=self.conn)
        logger.info(self.conn.list_tables())
        pprint.pprint(self.conn.describe_table('SMARTCAM'))

    # <TEST ONLY> Creates one item in table
    def create_items(self, num_items=2):
        cnt = 0
        for rasp_name in DynamoDBUtils.rasp_names:
            for i in xrange(num_items):
                cnt += 1
                self.__create_item(rasp_name, cnt)
                time.sleep(num_items)

    # <TEST ONLY> Creates one item in table
    def __create_item(self, rasp_name, num):
        data = dict()

        data['RASP_NAME'] = rasp_name
        data['START_TIME'] = time.time()
        data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET
        data['S3_KEY'] = 'videos/video_{0}.avi'.format(num)
        data['PROCESSED'] = 0
        data['CLASSIFIED'] = 0
        data['VERSION'] = 0

        logger.info("# Uploading Data for {0}: {1}".format(rasp_name, num))
        self.sc.put_item(data)

    # <TEST ONLY> Creates multiple full items in table
    def create_full_items(self, num_items=10, start_time=1459555200):
        cnt = 0
        with self.sc.batch_write() as batch:
            for rasp_name in DynamoDBUtils.rasp_names:
                st = start_time
                for i in xrange(num_items):
                    cnt += 1
                    if cnt % 2 == 0:
                        batch.put_item(
                            self.__create_full_item(
                                rasp_name, st,
                                'videos/video_100_frames_1.mp4'))
                    else:
                        batch.put_item(
                            self.__create_full_item(
                                rasp_name, st,
                                'videos/video_100_frames_2.mp4'))
                    st += 11.25  # 10 + 1.25 secs between 2 video files

    # <TEST ONLY> Creates multiple full items in table
    # All Hard code values for purpose of testing the Backend/UI Integration
    def __create_full_item(self, rasp_name, start_time, s3_key):
        data = dict()

        data['RASP_NAME'] = rasp_name
        data['START_TIME'] = start_time
        data['UPDATE_TIME'] = start_time + 5
        data['S3_BUCKET'] = DynamoDBUtils.S3_BUCKET
        data['S3_KEY'] = s3_key

        data['FRAME_COUNT'] = DynamoDBUtils.FACES[s3_key]['frame_count']
        data['FACE_COUNT'] = DynamoDBUtils.FACES[s3_key]['face_count']
        data['FACE_COUNT_UNIQ'] = DynamoDBUtils.FACES[s3_key][
            'face_count_uniq']

        # Face Counts / Detail
        d = {}
        #d['data'] = ['5','5','5','5','5','5','5','5','5','6']
        data['FACE_COUNT_DTL'] = DynamoDBUtils.FACES[s3_key]['face_count_dtl']

        d = {}
        #d['data'] = ['0','0','0','0','0','0','0','0','0','1']
        data['FACE_COUN_UNIQ_DTL'] = DynamoDBUtils.FACES[s3_key][
            'face_count_uniq_dtl']

        d = {}
        d['data'] = [
            '0.1', '0.1', '0.1', '0.05', '0.05', '0.15', '0.001', '0.05',
            '0.01', '0.01'
        ]
        data['FOREGROUND'] = d

        data['PROCESSED'] = 1
        data['VERSION'] = 1

        logger.info("# Uploading Data for {0}: {1}".format(
            rasp_name, start_time))

        # Converted to a Batch Write
        #self.sc.put_item(data)

        return data

    # Creates one item in table
    def create_item(self, rasp_name, s3_bucket, s3_key, s_time):
        data = dict()

        data['RASP_NAME'] = rasp_name
        data['START_TIME'] = s_time
        data['S3_BUCKET'] = s3_bucket
        data['S3_KEY'] = s3_key
        data['PROCESSED'] = 0
        data['CLASSIFIED'] = 0
        data['VERSION'] = 0
        data['LEN'] = randint(5, 60)

        logger.info("# Uploading Data for {0}: {1}:{2}".format(
            rasp_name, s3_bucket, s3_key))
        self.sc.put_item(data)

    # Fetch items
    def display_items(self):
        rows = self.sc.query_2(index='PROCESSED-index', PROCESSED__eq=0)
        cnt = 0
        for row in rows:
            logger.info('{0},{1},{2}'.format(row['RASP_NAME'],
                                             row['START_TIME'],
                                             row['PROCESSED']))
            cnt += 1
        logger.info('# Total unprocessed items: {0}'.format(cnt))
        return rows

    def purge_table(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row.delete()
            logger.info('Deleted Row: {0}'.format(cnt))

    def delete_by_id(self, id):
        cnt = 0
        for row in self.get_items_by_id(id):
            cnt += 1
            row.delete()
            logger.info('Deleted Row: {0}'.format(cnt))

    def reset_processed(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row['PROCESSED'] = 0
            self.update(row)
            logger.info('Update Row: {0}'.format(cnt))

    def reset_classified(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row['CLASSIFIED'] = 0
            self.update(row)
            logger.info('Update Row: {0}'.format(cnt))

    def add_classified(self):
        cnt = 0
        for row in self.sc.scan():
            cnt += 1
            row['CLASSIFIED'] = 0
            self.update(row)
            logger.info('Update Row: {0}'.format(cnt))

    def get_unprocessed_items(self):
        return self.sc.query_2(index='PROCESSED-index', PROCESSED__eq=0)

    def get_processed_items(self):
        return self.sc.query_2(index='PROCESSED-index', PROCESSED__eq=1)

    def get_unclassified_items(self):
        return self.sc.query_2(index='CLASSIFIED-index', CLASSIFIED__eq=0)

    def get_classified_items(self):
        return self.sc.query_2(index='CLASSIFIED-index', CLASSIFIED__eq=1)

    def get_items_by_id(self, id):
        return self.sc.query_2(RASP_NAME__eq=id)

    def get_items_by_id_range(self, id, start, end):
        return self.sc.query_2(RASP_NAME__eq=id,
                               START_TIME__between=(start, end))

    def update(self, row):
        try:
            row.save(overwrite=True)
        except Exception as e:
            logger.error(e)
            logger.info('[FAILED] Processing: ', row['RASP_NAME'],
                        row['START_TIME'], row['PROCESSED'])

    def stats(self, lst):
        quotient, remainder = divmod(len(lst), 2)
        if remainder:
            return sorted(lst)[quotient]
        return sum(lst) / len(lst), sum(
            sorted(lst)[quotient - 1:quotient + 1]) / 2

    def close(self):
        self.conn.close()
Code example #41
0
class DDBRuns(Dynamo):
    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.table = Table('test_runs', connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        When called directly (as should be done for production code), sets table to the production 'runs' table.
        """
        super(DDBRuns, self).__init__(access_key, secret)
        self.table = Table('runs', connection=self.connection)

    def save_new_run(self,
                     dt_str=None,
                     start_date_str=None,
                     end_date_str=None):
        """
        dt_str = datetime of run. Defaults to now.
        start_date_str = the start date for look-back of query performance data processing. * No default
        end_date_str = the end date for query performance data processing. Defaults to today.
        """
        assert start_date_str, "start_date_str is required when saving a new run to runs table."
        assert DAY_STR_RE.match(start_date_str)
        if end_date_str:
            assert DAY_STR_RE.match(end_date_str)
        if dt_str:
            assert SECOND_STR_RE.match(dt_str)

        dt_str = dt_str or datetime.now().strftime(SECOND_STR_FORMAT)
        end_date_str = end_date_str or datetime.now().strftime(DAY_STR_FORMAT)
        return self.table.put_item(data={
            'dt': dt_str,
            'start': start_date_str,
            'end': end_date_str
        })

    def most_recent_start_date_str(self):
        """
        :return: a string representing most recent start date from db
        """
        df = self.get_runs_df()
        if df.empty:
            return None
        else:
            # should already be sorted, but just in case...
            df.sort(columns=['dt'], ascending=True, inplace=True)
            return df.iloc[len(df) - 1]['start']

    def most_recent_end_date_str(self):
        """
        :return: a string representing most recent end date from db
        """
        df = self.get_runs_df()
        if df.empty:
            return None
        else:
            # should already be sorted, but just in case...
            df.sort(columns=['dt'], ascending=True, inplace=True)
            return df.iloc[len(df) - 1]['end']

    def get_runs_df(self):
        """
        Returns all table as dataframe, sorted with most recent entry on bottom (ascending order)
        """
        df = DataFrame([{k: v
                         for k, v in r.items()} for r in self.table.scan()])
        if df.empty:
            return df
        else:
            df.sort(columns=['dt'], ascending=True, inplace=True)
            # force df to have columns in this order
            return df[['dt', 'start', 'end']]

    def modify_throughput(self, requested_read, requested_write, table=None):
        table = table or self.table
        return super(DDBRuns, self).modify_throughput(requested_read,
                                                      requested_write, table)

    def truncate_table(self):
        """
        WARNING! Only use for test mode table
        """
        assert self.table.table_name == 'test_runs', "Will only truncate test table. To truncate production table, run code manually"
        with self.table.batch_write() as batch:
            for item in self.table.scan():
                batch.delete_item(dt=item['dt'])

    def thors_start_end_date_strings(self, new_run=True, days_ago_start=30):
        if new_run:
            if days_ago_start is not None:
                print days_ago_start
                start_date_str = self._days_ago_str(days_ago_start)
            else:
                start_date_str = self.most_recent_end_date_str()
            end_date_str = date.today().strftime(DAY_STR_FORMAT)
        else:
            start_date_str = self.most_recent_start_date_str()
            end_date_str = self.most_recent_end_date_str()
            assert start_date_str, "Start date string is None, please check the database since we are not doing a new run"
            assert end_date_str, "End date string is None, please check the database since we are not doing a new run"
        return start_date_str, end_date_str

    def _days_ago_str(self, num_days_ago):
        return (date.today() -
                timedelta(days=num_days_ago)).strftime(DAY_STR_FORMAT)

    def start_end_date_strings(self, new_run=True, days_ago_start=30):
        if new_run:
            start_date_str = self.most_recent_end_date_str(
            ) or self._days_ago_str(days_ago_start)
            end_date_str = date.today().strftime(DAY_STR_FORMAT)
        else:
            start_date_str = self.most_recent_start_date_str()
            end_date_str = self.most_recent_end_date_str()
        return start_date_str, end_date_str
Code example #42
0
class Index(Base):
    '''An Index for docker-registry that uses Amazon AWS DynamoDB as the storage engine.
    
    Boto is used to do all access to DynamoDB.
    
    Configure the following dynamodb_config variables or environment variables:
    
    dynamodb_index_database - optional, if not specified will default to 'docker-registry'
        and the repository and version table names will be constructed using the
        {dynamodb_index_database}-repository and {dynamodb_index_database}-version.
        DynamoDB does not have a database concept, just tables in the data store.
    
    dynamodb_index_repository_table - override the default table name (above) with a new name
    
    dynamodb_index_version_table - override the default table name with a new name
    
    dynamodb_region - the AWS region for the dynamodb. This will default to the s3_region and if
        that is not defined, it defaults to 'us-east-1'.
    
    dynamodb_access_key - the AWS access key to use
    
    dynamodb_secret_access_key - the AWS secret part of the access key
    '''
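    # Illustrative settings (hypothetical values; these map to the
    # 'extensions.dynamodb_index.*' keys read in __init__):
    #   dynamodb_index_database    = docker-registry
    #   dynamodb_region            = us-west-2
    #   dynamodb_access_key        = <IAM access key id>
    #   dynamodb_secret_access_key = <IAM secret access key>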
    
    _initLock = Lock()

    def __init__(self, database=None, dynamodb_access_key=None, dynamodb_secret_access_key=None):
        '''
        Constructor
        '''
        cfg = dynamodb_config.load()
        if database is None:
            database = cfg['extensions.dynamodb_index.database']
        if dynamodb_access_key is None:
            dynamodb_access_key = cfg['extensions.dynamodb_index.access_key']
        if dynamodb_secret_access_key is None:
            dynamodb_secret_access_key = cfg['extensions.dynamodb_index.secret_access_key']
        
        self.repositoryTableName = cfg['extensions.dynamodb_index.repository_table']
        self.versionTableName = cfg['extensions.dynamodb_index.version_table']
        
        if dynamodb_access_key is None:
            self._db = dynamodb2.connect_to_region(cfg['extensions.dynamodb_index.region'])
        else:
            self._db = dynamodb2.connect_to_region(cfg['extensions.dynamodb_index.region'],
                                                   aws_access_key_id=dynamodb_access_key,
                                                   aws_secret_access_key=dynamodb_secret_access_key)
        
        self._repositoryTable = Table(self.repositoryTableName,
                                     schema=[HashKey('name', data_type=STRING)],
                                     global_indexes=[GlobalAllIndex('Repositories-By-Description-Index',
                                                                    parts=[HashKey('description', data_type=STRING)])],
                                     connection=self._db)
        self._versionTable = Table(self.versionTableName,
                                  schema=[HashKey('version', data_type=NUMBER)],
                                  connection=self._db)

        self.version = 1
        Index._initLock.acquire()
        try:
            self._setup_database()
        finally:
            Index._initLock.release()
        super(Index, self).__init__()
    
    def _describe_or_create_tables(self):
        dynamodb_util.create_table_if_not_exists(self._repositoryTable)
        dynamodb_util.create_table_if_not_exists(self._versionTable)

            
    def _wait_for_tables(self):
        dynamodb_util.wait_for_table_active(self._repositoryTable)
        dynamodb_util.wait_for_table_active(self._versionTable)
    
    def _read_or_set_schema_version(self, default_version):
        def read_schema_version():
            v = 0
            try:
                results = self._versionTable.scan()
                row = results.next()
                v = row['version']
            except:
                v = -1
            return v

        # Read or insert the schema_version. Keep doing it until one
        # of them works. This is in case another thread is attempting the same
        # thing. Reading first will allow this thread to complete.
        schemaVersion = read_schema_version()
        while (schemaVersion <= 0):
            try:
                self._versionTable.put_item(data={'version': default_version})
                schemaVersion = default_version
            except:
                sleep(0.5)
                schemaVersion = read_schema_version()
                
        return schemaVersion
    
    
    def _setup_database(self):
        needs_index = not dynamodb_util.table_exists(self._versionTable)
        self._describe_or_create_tables()
        self._wait_for_tables()
        
        version = self._read_or_set_schema_version(self.version)
        if (version != self.version):
            raise NotImplementedError('unrecognized search index version {0}'.format(version))
        if needs_index:
            self._generate_index()
        
    def _generate_index(self):
        store = storage.load()
        with self._repositoryTable.batch_write() as batch:
            for repository in self._walk_storage(store=store):
                logger.info('Populating repository: {0}'.format(repository['name']))
                batch.put_item(data=repository)
        
    def _handle_repository_created(
            self, sender, namespace, repository, value):
        name = '{0}/{1}'.format(namespace, repository)
        description = ''  # TODO(wking): store descriptions
        logger.info('Creating new repository {0}'.format(name))
        self._repositoryTable.put_item(data={'name': name, 'description': description})

    def _handle_repository_updated(
            self, sender, namespace, repository, value):
        name = '{0}/{1}'.format(namespace, repository)
        description = ''  # TODO(wking): store descriptions
        logger.info('Updating repository {0}'.format(name))
        repo = self._repositoryTable.get_item(name=name)
        repo['description'] = description
        repo.save(overwrite=True)
        

    def _handle_repository_deleted(self, sender, namespace, repository):
        name = '{0}/{1}'.format(namespace, repository)
        logger.info('Deleting repository {0}'.format(name))
        self._repositoryTable.delete_item(name=name)

    def results(self, search_term=None):
        """Return a list of results matching search_term

        The list elements should be dictionaries:

          {'name': name, 'description': description}
        """
        if not search_term or len(search_term) == 0:
            logger.info('Index query: full table scan')
            repositories = self._repositoryTable.scan()
        else:
            logger.info('Index query: {0}'.format(search_term))
            repositories = self._repositoryTable.scan(conditional_operator='OR',
                                                      name__contains=search_term,
                                                      description__contains=search_term)

        if repositories:
            return [{'name': repo['name'],
                     'description': repo['description'],
                     } for repo in repositories]
       
        return []
コード例 #43
0
###########################
## Batch writing
###########################

# If you’re loading a lot of data at a time, making use of batch writing can both speed up the process &
# reduce the number of write requests made to the service.

# Batch writing involves wrapping the calls you want batched in a context manager. The context manager
# imitates the Table.put_item & Table.delete_item APIs. Getting & using the context manager looks like:

import time
from boto.dynamodb2.table import Table
tweets = Table('tweets')

with tweets.batch_write() as batch:
    batch.put_item(
        data={
            'id': '1111',
            'username': '******',
            'screen_name': 'yyyy',
            'tweet': 'yes yes',
        })
    batch.put_item(
        data={
            'id': '2222',
            'username': '******',
            'screen_name': 'dddd',
            'tweet': 'no no',
        })
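
# The context manager also imitates Table.delete_item, so puts and deletes can be
# mixed in one batch. A small sketch, assuming the 'tweets' table above is keyed
# on 'id':

with tweets.batch_write() as batch:
    batch.put_item(
        data={
            'id': '3333',
            'username': 'cccc',
            'screen_name': 'eeee',
            'tweet': 'maybe maybe',
        })
    batch.delete_item(id='1111')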
コード例 #44
0
def main():
    aud = sys.argv[1]
    f = open(aud)
    region = 'us-east-1'
    #region = 'ap-southeast-1'
    print 'Connecting to %s with IAM role' % (region)
    #conn = boto.dynamodb.connect_to_region(region)

    #    table = conn.get_table('users1')
    table = Table('users1')
    skipped = 0
    newcnt = 0
    updatedcnt = 0
    samecnt = 0
    cnt = 0
    batchcnt = 0
    errcnt = 0
    batch = None

    for line in f:

        if not batch:
            batch = table.batch_write()
            print "Got batch %s" % batch

        (cookie, segs) = line.split("\t")
        if " " in cookie:
            cookie = cookie.replace(" ", "+")
        if not cookie.endswith("=="):
            cookie = cookie + "=="
        try:
            # print "Decoding %s" % cookie
            cdec = base64.b64decode(cookie)
            s = struct.unpack("<IIII", cdec)
            uid = "%08X%08X%08X%08X" % s
        except:
            errcnt += 1
            continue

        # print "%s -> %s" % (cookie, uid)
        seg_list = segs.strip().split(",")
        seg_list = ['%s:tp:1' % s for s in seg_list]
        try:
            item = table.get_item(key=uid)
            json = item['doAttr']
            e = simplejson.loads(json)
            if not e:
                newcnt += 1
                e = []
        except boto.dynamodb2.exceptions.ItemNotFound:
            newcnt += 1
            item = {'dtAttr': 'java.util.Set', 'doAttr': '[]'}
            e = []
        # e - existing
        e = [s.replace(':fp:', ':tp:').strip() for s in e]
        e = sets.Set(e)
        # n - new
        n = sets.Set(seg_list)
        # combine
        n.update(e)
        # if the same no need to write
        if n == e:
            samecnt += 1
            skipped += 1
            continue
        elif e:
            updatedcnt += 1
        n = list(n)

        item['doAttr'] = simplejson.dumps(n)
        #print "Putting %s" % item
        batchcnt += 1
        batch.put_item(data={
            'doAttr': item['doAttr'],
            'dtAttr': 'java.util.Set',
            'key': uid
        })

        #item.put()
        cnt += 1
        if cnt % BATCH_SIZE == 0:
            batch.flush()
            batch = None
        if cnt % 5000 == 0:
            print "OK"
            print item
            print "User count: %s total, updated %s, same %s, new %s, error %s" % (
                cnt, updatedcnt, samecnt, newcnt, errcnt)
            print "Wrote %s users" % cnt
            # record a LAST_WRITE checkpoint item for this run
            item2 = {
                'key': 'LAST_WRITE',
                'dtAttr': 'java.lang.String',
                'doAttr':
                "LOTAME: User count: %s total, updated %s, same %s, new %s, error %s at %s\nLast user: %s : %s"
                % (cnt, updatedcnt, samecnt, newcnt, errcnt,
                   datetime.datetime.now(), uid, str(item))
            }
            print item2
            table.put_item(data=item2, overwrite=True)
    if batch:
        batch.flush()
    item2 = {
        'key': 'LAST_WRITE',
        'dtAttr': 'java.lang.String',
        'doAttr':
        "LOTAME: User count: %s total, updated %s, same %s, new %s, error %s at %s"
        % (cnt, updatedcnt, samecnt, newcnt, errcnt, datetime.datetime.now()),
        'item': str(item)
    }
    print item2
    table.put_item(data=item2, overwrite=True)
    print "Added or updated %s users, skipped %s, to %s region" % (
        cnt, skipped, region)
コード例 #45
0
ファイル: dbconn.py プロジェクト: clach04/AO3rdr-backend
class DBconn(object):
    def __init__(self):
        aws_access_key_id = os.environ['S3_KEY']  # I AM OPS U NO GET MY KEYS
        aws_secret_access_key = os.environ['S3_SECRET']  # DIS IS MY JOB

        self._conn = DynamoDBConnection(
            aws_access_key_id=aws_access_key_id, 
            aws_secret_access_key=aws_secret_access_key)
        self.works_table = Table('ao3rdr-works', connection=self._conn)
        self.immutable_fields = ['work_id', 'user_id']

    def get_user(self, user_id):
        res = self.works_table.query_2(
            user_id__eq=user_id, work_id__eq='settings', attributes=['user_id'])
        out = []
        for entry in res:
            out.append(self.serialize(entry)['user_id'])
        return out

    def add_user(self, user_id):
        """ Adding a user adds a special "work" which is used to store a user's
            settings.
        """
        return self.works_table.put_item(data={
            'user_id': user_id,
            'work_id': 'settings',
            'created': time.time()
        })

    def update_work(self, user_id, work_id, data):
        item = self.works_table.get_item(user_id=user_id, work_id=work_id)
        # update the item
        for key, value in data.iteritems():
            if key not in self.immutable_fields:
                item[key] = value
        item['updated'] = time.time()
        item.partial_save()

    def create_work(self, user_id, work_id, data):
        data['user_id'] = user_id
        data['work_id'] =  work_id
        self.works_table.put_item(data)

    def batch_update(self, data_list):
        with self.works_table.batch_write() as batch:
            for data in data_list:
                batch.put_item(data=data)

    def get_work(self, user_id, work_id):
        try:
            res = self.works_table.get_item(user_id=user_id, work_id=work_id)
        except ItemNotFound:
            return {}
        return self.serialize(res)

    def get_all_works(self, user_id):
        res = self.works_table.query_2(user_id__eq=user_id)
        for entry in res:
            yield self.serialize(entry)

    def close(self):
        self._conn.close()

    def serialize(self, item):
        out = serialize(dict(item))
        return out
コード例 #46
0
        filer = codecs.open(directory + "/nyt-" + dayStr + ".json", "rb", encoding="utf-8")

    except:
        logging.debug("could not open file" + filename)
        exit()

    data = json.loads(filer.read())

    filer.close()

    for dayArticle in data:  #for each day
        if (dayArticle["response"] != None):
            #articles are bundled in groups of 10 (because that is how many articles per request the NYT API was returning).
            # will batch write for better performance and throughput savings.
            with nyt.batch_write() as batch:

                for doc in dayArticle["response"]['docs']:
                    logging.debug("processing articleId " + doc['_id'] + " published at " + doc['pub_date'])

                    #must replace all empty strings b/c otherwise dynamodb complains
                    doc = replaceEmptyString2(doc)

                    convertedDate = dateutil.parser.parse(doc['pub_date'])

                    #saving datetime as a timestamp
                    timestamp = time.mktime((convertedDate.year, convertedDate.month, convertedDate.day, convertedDate.hour, convertedDate.minute, convertedDate.second, -1, -1, -1)) + convertedDate.microsecond / 1e6

                    data = {
                        "id": doc['_id'],
コード例 #47
0
class DDBSlurps(Dynamo):
    @classmethod
    def from_test_mode(cls, access_key=None, secret=None):
        """
        Use this for getting an instance of this class that uses test tables.
        """
        instance = cls(access_key, secret)
        instance.slurps_table = Table('test_slurps',
                                      connection=instance.connection)
        instance.failed_slurps_table = Table('test_failed_slurps',
                                             connection=instance.connection)
        return instance

    def __init__(self, access_key=None, secret=None):
        """
        ! Use test_mode factory method for instantiating this class with test_slurps and test_failed_slurps tables
        """
        super(DDBSlurps, self).__init__(access_key, secret)

        self.slurps_table = Table('slurps', connection=self.connection)
        self.failed_slurps_table = Table('failed_slurps',
                                         connection=self.connection)

    def save_slurp_info(self, slurp_info_, overwrite=True):
        """
        slurp_info_ can either be in the form of a list of dicts or else a single dict.
        If slurp_info is a list, batch write will be used
        """
        if isinstance(slurp_info_, dict):
            self.slurps_table.put_item(slurp_info_, overwrite=overwrite)
        elif isinstance(slurp_info_, list):
            with self.slurps_table.batch_write() as batch:
                for s in slurp_info_:
                    batch.put_item(data=s, overwrite=overwrite)
        else:
            raise TypeError, "slurp_info must be a dict or a list of dicts, not a {}".format(
                type(slurp_info_))
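
    # Usage sketch (the data keys besides 'searchterm' are assumptions; only
    # 'searchterm' is known to be the table's hash key from the calls below):
    #   ddb.save_slurp_info({'searchterm': 'foo'})                       # single put_item
    #   ddb.save_slurp_info([{'searchterm': 'a'}, {'searchterm': 'b'}])  # batch_write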

    def save_failed_slurp(self, searchterm):
        self.failed_slurps_table.put_item(
            data={
                'searchterm': searchterm,
                'datetime': datetime.now().isoformat()
            },
            overwrite=True)

    def get_slurp_info(self, search_term_=None):
        """
        search_term_ can be either a string or a list of strings. Each string should be a search term you are looking
        for in the db.
        Returns either a single list of key-value tuples (if search_term_ was a string)
        or a list of key-value tuples (if search_term_ was a list)
        Each list of key-value tuples can easily be converted to a dict or an OrderedDict by the client.
        """

        # searchterm_ is a STRING
        if isinstance(search_term_, basestring):
            if search_term_:
                slurp_info = (self.slurps_table.get_item(
                    searchterm=search_term_)).items()
            else:
                slurp_info = []

        # searchterm is a LIST of strings
        elif isinstance(search_term_, list):
            if search_term_:
                # create a set of non-empty searchterms. We use a set to avoid a duplicate-query error from the db
                set_of_sts = {st for st in search_term_ if st}
                # create a list of dicts from the set
                list_of_st_dicts = [{'searchterm': st} for st in set_of_sts]
                res = self.slurps_table.batch_get(list_of_st_dicts)
                try:
                    slurp_info = [i.items() for i in res]
                except (StopIteration, IndexError):
                    # If res is empty, we get one of these errors when trying to iterate.
                    slurp_info = []
            else:
                slurp_info = []

        # searchterm is an unexpected type
        else:
            raise TypeError, "search_term_ must be a dict or a list of dicts, not a {}".format(
                type(search_term_))

        return slurp_info

    def existing_and_missing_uni(self, searchterm_list):
        """
        Takes a list of searchterm strings and returns a list of searchterm strings that were found in the db (in unicode)
        and a list of the searchterms that were missing from the found results
        """
        # make sure in utf8 before we send request to the db
        input_sts_utf8 = [to_utf8_or_bust(i) for i in searchterm_list]
        found_sts_info = self.get_slurp_info(input_sts_utf8)
        found_sts_uni = [
            to_unicode_or_bust(dict(i)['searchterm']) for i in found_sts_info
        ]
        input_sts_uni = [to_unicode_or_bust(i) for i in input_sts_utf8]
        missing_sts_uni = order_conserving.setdiff(input_sts_uni,
                                                   found_sts_uni)
        return found_sts_uni, missing_sts_uni

    def get_table(self, table_name):
        """
        Convenience method for client who may wish to get a specific table from the DynamoDB connection
        """
        return Table(table_name, connection=self.connection)

    def truncate_failed_slurp_table(self):
        """
        """
        with self.failed_slurps_table.batch_write() as batch:
            for item in self.failed_slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def truncate_slurp_table(self):
        """
        WARNING! Only use for test mode table
        """
        assert self.slurps_table.table_name == 'test_slurps', "Will only truncate test slurps table. To truncate production table, run code manually"
        test_slurps_table = Table('test_slurps', connection=self.connection)
        with test_slurps_table.batch_write() as batch:
            for item in self.slurps_table.scan():
                batch.delete_item(searchterm=item['searchterm'])

    def modify_failed_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.failed_slurps_table)

    def modify_slurps_throughput(self, requested_read, requested_write):
        return self.modify_throughput(requested_read, requested_write,
                                      self.slurps_table)

    def get_slurps_table_info(self):
        return self.get_table_info(self.slurps_table)

    def get_failed_slurps_table_info(self):
        return self.get_table_info(self.failed_slurps_table)
コード例 #48
0
ファイル: aws_dynamodb.py プロジェクト: ptrcode/cloudauto
class CloudTrailTable():
    """
    Class to represent the cloudtrail table in dynamodb
    """
    def __init__(self, table_name=settings.DYNAMODB_CLOUDTRAIL_TABLE):
        self.conn = dynamodb_connection()
        self.table = Table(table_name, connection=self.conn)

    def save_items(self, items, project_id):
        """
        Saves the items into db after 'dynamizing' them as a batch operation
        :param items: the list of items
        :param project_id: ProjectId hashkey for the table
        :return: None
        """
        # Parse every item in the response, add keys as per dynamodb, and
        # do a batch update
        geo_conn = geo_connection()
        with self.table.batch_write() as batch:
            for item in items:
                ctj = json.loads(item['CloudTrailEvent'])
                item['ProjectId'] = int(project_id)
                ip = ctj['sourceIPAddress']
                item['sourceIPAddress'] = ip
                item['countryCode'] = country_from_ip(ip, geo_conn=geo_conn)
                item['awsRegion'] = ctj['awsRegion']

                project_content_type_id = ContentType.objects.get_for_model(
                    ProjectAWS).pk
                project_object_id = int(project_id)
                # Signal the receiver for event names here
                cloudtrail_notification_signal.send(
                    sender=item['EventId'],
                    context_data=ctj,
                    project_content_type_id=project_content_type_id,
                    project_object_id=project_object_id)

                if settings.IS_DYNAMODB_LOCAL:
                    # We need to dynamize to store data in form of list, map etc
                    dy = types.Dynamizer()
                    for k, v in item.iteritems():
                        item[k] = dy.encode(v)

                batch.put_item(data=item)

    def delete_items(self, project_id, before_time):
        """
        Deletes the items before the given time
        :param before_time: time to query for items and delete
        :return: None
        """
        for item in self.table.query_2(ProjectId__eq=project_id,
                                       EventTime__lt=before_time,
                                       index='EventTime-index'):
            item.delete()

    def query_events(self, project_id):
        """
        Returns the items in the table for a given project id
        :param project_id: The hashkey project_id
        :return: table rows matching  the argument
        """
        return self.table.query_2(ProjectId__eq=project_id)
コード例 #49
0
class DynamoDBAdapter(key_value_store.KeyValueStore):

    """ Implementation of an abstract key-value store defined in
    key_value_store.py. The underlying database is amazon DynamoDB.

    The store keeps all objects in a single table with following schema:
    [HashKey('kind', data_type=STRING), RangeKey('id')]. 'kind' is the string
    with the object type ('vector', 'set' or 'int') and 'id' is the object id.
    The object value is stored in the 'value' attribute of the table items.

    The table should be created before this code is executed. Amazon
    configuration is assumed to be stored in ~/.boto file as described in
    http://boto.readthedocs.org/en/latest/boto_config_tut.html
    """

    def __init__(self, precision=np.dtype('float32'), table_name='test'):
        """ Create an instance of the dynamodb key-value store.
        precision - a numpy type, elements of all vectors are converted and
           stored in this type;
        table_name - the name of the DynamoDB table which keeps the objects.
        """
        conn = boto.dynamodb2.connect_to_region('eu-west-1')
        if not isinstance(precision, np.dtype):
            raise TypeError("Precision should be a numpy.dtype subtype")
        self.precision = precision
        self.precision_name = precision.name
        self.table = Table(table_name, connection=conn)

    def _get_or_create_item(self, kind, item_id):
        try:
            item = self.table.get_item(kind=kind, id=item_id)
        except ItemNotFound:
            item = Item(self.table)
            item['kind'] = kind
            item['id'] = item_id
        return item

    def _create_vector_item(self, vec_id, vector):
        item = self._get_or_create_item('vector', vec_id)
        item['value'] = Binary(vector.astype(self.precision).tostring())
        item['precision'] = self.precision_name
        return item

    def _vector_value(self, item):
        return np.fromstring(str(item['value']), np.dtype(item['precision']))

    def get_vector_ids(self):
        return [v['id'] for v in self.table.query_2(kind__eq='vector')]

    def get_int_ids(self):
        return [v['id'] for v in self.table.query_2(kind__eq='int')]

    def get_set_ids(self):
        return [v['id'] for v in self.table.query_2(kind__eq='set')]

    def store_vector(self, vec_id, vector):
        item = self._create_vector_item(vec_id, vector)
        item.save()

    def get_vector(self, vec_id):
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id,))
        return self._vector_value(item)

    def bulk_get_vector(self, vec_ids):
        keys = [{'kind': 'vector', 'id': i} for i in vec_ids]
        vs = self.table.batch_get(keys=keys)
        return [self._vector_value(i) for i in vs]

    def remove_vector(self, vec_id):
        try:
            item = self.table.get_item(kind='vector', id=vec_id)
        except ItemNotFound:
            raise KeyError('Vector key %s is unknown' % (vec_id,))
        item.delete()

    def add_to_set(self, set_id, element_id):
        item = self._get_or_create_item('set', set_id)
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            item['value'] = set()
        item['value'].add(element_id)
        item.save(overwrite=True)

    def remove_from_set(self, set_id, element_id):
        try:
            item = self.table.get_item(kind='set', id=set_id)
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))
        if 'value' not in item.keys() or not isinstance(item['value'], set):
            raise KeyError('Incorrect value in item %s' % (set_id,))
        if element_id not in item['value']:
            raise KeyError('Element %s not in set %s' % (element_id, set_id))
        item['value'].remove(element_id)
        item.save()

    def remove_set(self, set_id):
        try:
            item = self.table.get_item(kind='set', id=set_id)
            item.delete()
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))

    def get_set(self, set_id):
        try:
            the_set = self.table.get_item(kind='set', id=set_id)['value']
            return set([str(entry) for entry in the_set])
        except ItemNotFound:
            raise KeyError('Set key %s is unknown' % (set_id,))

    def store_int(self, int_id, integer):
        item = self._get_or_create_item('int', int_id)
        item['value'] = integer
        item.save()

    def get_int(self, int_id):
        try:
            return int(self.table.get_item(kind='int', id=int_id)['value'])
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id,))

    def remove_int(self, int_id):
        try:
            item = self.table.get_item(kind='int', id=int_id)
        except ItemNotFound:
            raise KeyError('Int key %s is unknown' % (int_id,))
        item.delete()

    def _aggregate_set_id_element_pairs(self, setpairs):
        """Turns a list of pairs of the form (set_id, element_id) into a list 'L' of
        pairs 'p' of the form (set_id, set_of_element_ids). 'L' has the property
        that if 'p' and 'q' are distinct entries in 'L', then p[0] and q[0] are
        also distinct."""
        set_ids = set([entry[0] for entry in setpairs])
        listlist = [[entry for entry in setpairs if entry[0] == set_id]
                    for set_id in set_ids]
        result = [(pairlist[0][0], set([entry[1] for entry in pairlist]))
                  for pairlist in listlist]
        return result
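
    # For example (illustrative values): [('a', 1), ('a', 2), ('b', 3)] is
    # aggregated to [('a', {1, 2}), ('b', {3})], so each set_id is written
    # only once per batch in bulk_add_to_set below.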

    def bulk_store_vector(self, vec_ids, vectors):
        if len(vec_ids) != len(vectors):
            raise ValueError
        vecpairs = zip(vec_ids, vectors)
        with self.table.batch_write() as batch:
            for vec_id, vec in vecpairs:
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_vector_old(self, vectors_df):
        """Argument 'vectors' is a dataframe with index vector ids."""
        if len(vec_ids) != len(vectors):
            raise ValueError
        with self.table.batch_write() as batch:
            for ind in vectors_df.index:
                vec_id = str(ind)
                vec = vectors_df.loc[ind].values
                item = self._create_vector_item(vec_id, vec)
                batch.put_item(item)

    def bulk_store_int(self, int_ids, integers):
        """Argument 'intpairs' is a list of pairs of the form (int_id, integer)."""
        if len(int_ids) != len(integers):
            raise ValueError
        intpairs = zip(int_ids, integers)
        with self.table.batch_write() as batch:
            for pair in intpairs:
                int_id, integer = pair
                item = self._get_or_create_item('int', int_id)
                item['value'] = integer
                batch.put_item(item)

    def bulk_add_to_set(self, set_ids, element_ids):
        """batch_write() objects if the same item is written to more
        than once per batch, hence we aggregate all (set_id, element_id)
        pairs into a list of pairs (set_id, element_ids), where
        the 'set_id's are pairwise distinct, and the 'element_ids'
        are sets."""
        if len(set_ids) != len(element_ids):
            raise ValueError
        setpairs = zip(set_ids, element_ids)
        setlist = self._aggregate_set_id_element_pairs(setpairs)
        with self.table.batch_write() as batch:
            for pair in setlist:
                set_id, element_ids = pair
                item = self._get_or_create_item('set', set_id)
                if 'value' not in item.keys() or not isinstance(
                        item['value'], set):
                    item['value'] = set()
                item['value'].update(element_ids)
                batch.put_item(item)
コード例 #50
0
ファイル: storeNYTIntodynamo.py プロジェクト: wang2731/MIDS
                            "rb",
                            encoding="utf-8")

    except:
        logging.debug("could not open file" + filename)
        exit()

    data = json.loads(filer.read())

    filer.close()

    for dayArticle in data:  #for each day
        if (dayArticle["response"] != None):
            #articles are bundled in groups of 10 (because that is how many articles per request the NYT API was returning).
            # will batch write for better performance and throughput savings.
            with nyt.batch_write() as batch:

                for doc in dayArticle["response"]['docs']:
                    logging.debug("processing articleId " + doc['_id'] +
                                  " published at " + doc['pub_date'])

                    #must replace all empty string b/c otherwise dynamodb complains
                    doc = replaceEmptyString2(doc)

                    convertedDate = dateutil.parser.parse(doc['pub_date'])

                    #saving datetime as timestamp
                    timestamp = time.mktime(
                        (convertedDate.year, convertedDate.month,
                         convertedDate.day, convertedDate.hour,
                         convertedDate.minute, convertedDate.second, -1, -1,