def ImportCSV(app, kind, key_column, body_file, metadata_entity, user):
    error = False
    reader = csv.DictReader(body_file)
    # Hack to support Python 2.5 / 2.6
    if reader.fieldnames is None:
        reader.fieldnames = reader.reader.next()
    for f in reader.fieldnames:
        if ' ' in f or f[0:1].isdigit() or f[0] == '-':
            logging.error('Invalid field name: ' + f)
            error = True
    if (key_column and key_column not in reader.fieldnames):
        error = True
    if not key_column:
        key_column = reader.fieldnames[0]
    if error:
        return -1
    rows = 0
    for row in reader:
        key = row[key_column]
        if "kind" in row:
            del row["kind"]
        if "app" in row:
            del row["app"]
        if "key" in row:
            del row["key"]
        for r in row:
            if row[r] == NULL_VALUE:
                row[r] = None
            assert r is not None, \
                "Could not split CSV row properly: row field contains an extra comma: " + str(row)
        all_null = True
        for r in row:
            if r == key_column:
                continue
            elif row[r] != None:
                all_null = False
                break
        if all_null:
            row[key_column] = None
        CleanUpFormats(row)
        datastore.RunInTransaction(store.update_entity, app, kind, key, row,
                                   metadata_entity, user)
        rows = rows + 1
    return rows
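# A minimal usage sketch, not part of the original module: it reuses the same
# `metadata_entity` and `user` objects that the `post` handler further below obtains
# from `store`, and the app name, kind and column names here are invented for
# illustration. ImportCSV returns -1 for an invalid header or missing key column,
# otherwise the number of rows written.
def _example_import_players(metadata_entity, user):
    import StringIO
    body_file = StringIO.StringIO("id,name,score\n1,alice,10\n2,bob,7\n")
    rows = ImportCSV("myapp", "Player", "id", body_file, metadata_entity, user)
    if rows == -1:
        logging.error("CSV rejected: invalid header or missing key column")
    return rows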
def RunInTransaction(self, func, *args, **kwds):
    """Run the passed function in a transaction. Blocks other changes to the storage.

    Args:
      func: a function reference
      args: the positional arguments list
      kwds: the keyword arguments dict

    Raises:
      score_ranker.TransactionFailedError if the transaction failed
    """
    return datastore.RunInTransaction(func, *args, **kwds)
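# A hedged usage sketch for the wrapper above: `storage` stands in for whatever
# object defines RunInTransaction, and the entity key and "score" property are
# invented. The get-modify-put sequence runs entirely inside one datastore
# transaction, so concurrent increments cannot clobber each other.
def _example_increment_score(storage, player_key, delta):
    from google.appengine.api import datastore

    def increment():
        entity = datastore.Get(player_key)                 # read inside the transaction
        entity["score"] = entity.get("score", 0) + delta   # modify
        datastore.Put(entity)                              # write atomically with the read
        return entity["score"]

    # Conflicts surface as score_ranker.TransactionFailedError per the docstring above.
    return storage.RunInTransaction(increment)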
def begin_processing(self, operation, operation_method, entities_per_task, queue):
    BATCH_SIZE = 3

    # Unpickle the source query
    query = cPickle.loads(str(self["query"]))

    def txn():
        try:
            marker = datastore.Get(self.key())
            marker.__class__ = ShardedTaskMarker

            queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
            processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
            queued_count = len(queued_shards)

            for j in xrange(min(BATCH_SIZE, queued_count)):
                pickled_shard = queued_shards.pop()
                processing_shards.append(pickled_shard)
                shard = cPickle.loads(str(pickled_shard))
                deferred.defer(
                    self.run_shard,
                    query,
                    shard,
                    operation,
                    operation_method,
                    entities_per_task=entities_per_task,
                    # Defer this task onto the correct queue with `_queue`, passing the `queue`
                    # parameter back to the function again so that it can do the same next time
                    queue=queue,
                    _queue=queue,
                    _transactional=True,
                )

            marker.put()
        except datastore_errors.EntityNotFoundError:
            logging.error(
                "Unable to start task %s as marker is missing",
                self.key().id_or_name()
            )
            return

    # Reload the marker (non-transactionally) and defer the shards in batches
    # transactionally. If this task fails somewhere, it will resume where it left off
    marker = datastore.Get(self.key())
    for i in xrange(0, len(marker[ShardedTaskMarker.QUEUED_KEY]), BATCH_SIZE):
        datastore.RunInTransaction(txn)
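# For orientation only: a sketch of the marker keys that begin_processing relies on,
# inferred from how they are used here and in start_mapping below. QUEUED_KEY must
# match the "shards_queued" list that start_mapping fills; the other two names are
# assumptions, and the real ShardedTaskMarker carries more state than this.
class _ShardedTaskMarkerKeysSketch(object):
    QUEUED_KEY = "shards_queued"      # shards waiting to be deferred
    RUNNING_KEY = "shards_running"    # assumed name: shards currently being processed
    FINISHED_KEY = "shards_finished"  # assumed name: shards whose entities are all mapped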
def GetOrInsert(key, kindName=None, parent=None, **kwargs):
    """
    Either creates a new entity with the given key, or returns the existing one.

    It is guaranteed that there is no race condition here; it will never overwrite a
    previously created entity. Extra keyword arguments passed to this function will be
    used to populate the entity if it has to be created; otherwise they are ignored.

    :param key: The key which will be fetched or created.
        If key is a string, it will be used as the name for the new entity, therefore
        kindName is required in this case.
    :type key: server.db.Key | String
    :param kindName: The data kind to use for that entity. Ignored if key is a db.Key.
    :type kindName: str
    :param parent: The parent entity of the entity.
    :type parent: db.Key or None
    :returns: Returns the wanted Entity.
    :rtype: server.db.Entity
    """
    def txn(key, kwargs):
        try:
            res = datastore.Get(key)
        except datastore_errors.EntityNotFoundError:
            res = Entity(kind=key.kind(), parent=key.parent(), name=key.name(), id=key.id())
            for k, v in kwargs.items():
                res[k] = v
            datastore.Put(res)
        return res

    if not isinstance(key, datastore_types.Key):
        try:
            key = datastore_types.Key(encoded=key)
        except:
            assert kindName
            key = datastore_types.Key.from_path(kindName, key, parent=parent)
    if datastore.IsInTransaction():
        return txn(key, kwargs)
    return datastore.RunInTransaction(txn, key, kwargs)
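# A short usage sketch, assuming it sits next to GetOrInsert in the same module.
# The kind name "UserCounter" and the `count` property are illustrative only; the
# keyword arguments are applied solely when the entity does not exist yet.
def _example_get_or_insert_counter(name):
    counter = GetOrInsert(name, kindName="UserCounter", count=0)
    # Calling it again with different kwargs returns the existing entity unchanged.
    return counter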
def start_mapping(
    identifier, query, operation, operation_method=None, shard_count=None,
    entities_per_task=None, queue=None
):
    """ This must *transactionally* defer a task which will call `operation._wrapped_map_entity`
        on all entities of the given `kind` in the given `namespace` and will then transactionally
        update the entity of the given `task_marker_key_key` with `is_finished=True` after all
        entities have been mapped.
    """
    shard_count = shard_count or getattr(settings, "DJANGAE_MIGRATION_DEFAULT_SHARD_COUNT", 32)
    shards_to_run = shard_query(query, shard_count)
    queue = queue or getattr(settings, "DJANGAE_MIGRATION_DEFAULT_QUEUE", _DEFAULT_QUEUE)

    def txn(shards):
        marker_key = ShardedTaskMarker.get_key(identifier, query._Query__namespace)
        try:
            datastore.Get(marker_key)
            # If the marker already exists, don't do anything - just return
            return
        except datastore_errors.EntityNotFoundError:
            pass

        marker = ShardedTaskMarker(identifier, query, namespace=query._Query__namespace)

        if shards:
            for shard in shards:
                marker["shards_queued"].append(cPickle.dumps(shard))
        else:
            # No shards, then there is nothing to do!
            marker["is_finished"] = True

        marker["time_started"] = datetime.utcnow()
        marker.put()

        if not marker["is_finished"]:
            deferred.defer(
                marker.begin_processing, operation, operation_method, entities_per_task, queue,
                _transactional=True, _queue=queue
            )

        return marker_key

    return datastore.RunInTransaction(txn, shards_to_run)
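# An illustrative call, with the kind name, queue name and the `touch` operation all
# invented for the example; the keyword arguments mirror start_mapping's signature.
# A plain function is passed as `operation`, so `operation_method` stays None.
def _example_start_touch_mapping():
    from google.appengine.api import datastore

    def touch(entity):
        entity["touched"] = True
        datastore.Put(entity)

    query = datastore.Query("Profile")   # unbounded query over one kind
    return start_mapping(
        identifier="touch-profiles",
        query=query,
        operation=touch,
        shard_count=8,
        entities_per_task=250,
        queue="migrations",
    )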
def ImportSplitFile(app, body_file):
    rows = 0
    segments = body_file.getvalue().split("\n\n")
    for seg in segments:
        buf = StringIO.StringIO(seg)
        header = buf.readline().lstrip("#")
        kind = header.strip()
        reader = csv.DictReader(buf)
        # Hack to support Python 2.5 / 2.6
        if reader.fieldnames is None:
            reader.fieldnames = reader.reader.next()
        for row in reader:
            key = row[reader.fieldnames[0]]
            CleanUpFormats(row)
            datastore.RunInTransaction(store.update_entity, app, kind, key, row)
            rows = rows + 1
    return rows
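# A hedged illustration of the split-file format consumed above: segments separated
# by a blank line, each starting with a "#<kind>" header line, followed by a CSV
# whose first column becomes the entity key. The kinds and columns are invented.
def _example_split_file():
    import StringIO
    body = (
        "#Player\n"
        "id,name\n"
        "1,alice\n"
        "\n"
        "#Team\n"
        "id,city\n"
        "10,Berlin\n"
    )
    return StringIO.StringIO(body)   # suitable as body_file for ImportSplitFile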
def run_shard(
    self, original_query, shard, operation, operation_method=None, offset=0,
    entities_per_task=None, queue=_DEFAULT_QUEUE
):
    """ Given a datastore.Query which does not have any high/low bounds on it, apply the bounds
        of the given shard (which is a pair of keys), and run either the given `operation`
        (if it's a function) or the given method of the given operation (if it's an object) on
        each entity that the query returns, starting at entity `offset`, and redeferring every
        `entities_per_task` entities to avoid hitting DeadlineExceededError.
        Tries (but does not guarantee) to avoid processing the same entity more than once.
    """
    entities_per_task = entities_per_task or getattr(
        settings, "DJANGAE_MIGRATION_DEFAULT_ENTITIES_PER_TASK", 100
    )
    if operation_method:
        function = getattr(operation, operation_method)
    else:
        function = operation

    marker = datastore.Get(self.key())
    if cPickle.dumps(shard) not in marker[ShardedTaskMarker.RUNNING_KEY]:
        return

    # Copy the query so that we can re-defer the original, unadulterated version, because once
    # we've applied limits and ordering to the query it causes pickle errors with defer.
    query = copy.deepcopy(original_query)
    query.Order("__key__")
    query["__key__ >="] = shard[0]
    query["__key__ <"] = shard[1]

    num_entities_processed = 0
    try:
        results = query.Run(offset=offset, limit=entities_per_task)
        for entity in results:
            function(entity)
            num_entities_processed += 1
            if num_entities_processed >= entities_per_task:
                raise Redefer()
    except (DeadlineExceededError, Redefer):
        # By keeping track of how many entities we've processed, we can (hopefully) avoid
        # re-processing entities if we hit DeadlineExceededError by redeferring with the
        # incremented offset. But note that if we get crushed by the HARD DeadlineExceededError
        # before we can redefer, then the whole task will retry and so entities will get
        # processed twice.
        deferred.defer(
            self.run_shard,
            original_query,
            shard,
            operation,
            operation_method,
            offset=offset+num_entities_processed,
            entities_per_task=entities_per_task,
            # Defer this task onto the correct queue (with `_queue`), passing the `queue`
            # parameter back to the function again so that it can do the same next time
            queue=queue,
            _queue=queue,
        )
        return  # This is important!

    # Once we've run the operation on all the entities, mark the shard as done
    def txn():
        pickled_shard = cPickle.dumps(shard)
        marker = datastore.Get(self.key())
        marker.__class__ = ShardedTaskMarker
        marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
        marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
        marker.put()

    datastore.RunInTransaction(txn)
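# For context only: `Redefer` is used purely as a control-flow signal above. A
# minimal definition consistent with that usage (the real project may define it
# differently) is simply an Exception subclass, raised after `entities_per_task`
# entities so the except branch can save progress and re-defer before timing out.
class _RedeferSketch(Exception):
    pass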
def run_in_transaction(self, func, *args, **kw):
    return datastore.RunInTransaction(func, *args, **kw)
def transactional_operation(*args, **kwargs):
    return datastore.RunInTransaction(operation, *args, **kwargs)
def post(self):
    metadata_entity = store.GetMetadataEntity(self.request)
    auth_level = store.GetAuthLevel(self.request, metadata_entity)
    (app, kind, id) = store.extract_path(self.request.path)
    if not store.IsEncryptionSufficient(self.request, metadata_entity):
        self.response.set_status(403)
        self.response.clear()
        return
    if not store.IsAuthorized(app, kind, id, auth_level, store.WRITE):
        self.response.set_status(401)
        self.response.clear()
        return
    user = store.GetUser(self.request)
    #logging.info("%s, %s, %s" % (app, kind, id))
    data = self.request.body_file.getvalue()
    data_obj = {}
    if (len(data) == 0):
        return
    if (app is not None and kind is not None and id is not None):
        try:
            data_obj = json.loads(data, use_decimal=True)
        except json.JSONDecodeError:
            self.response.set_status(500)
            self.response.clear()
            return
        datastore.RunInTransaction(store.update_entity, app, kind, id, data_obj,
                                   metadata_entity, user)
    elif self.request.headers['Content-type'].startswith('text/csv') and id is None:
        key_column = self.request.get('key', None)
        result = csv_import.ImportCSV(app, kind, key_column, self.request.body_file,
                                      metadata_entity, user)
        if result == -1:
            self.response.set_status(500)
            self.response.clear()
            return
        self.response.out.write(result)
        self.response.out.write("\n")
    elif (app is not None and kind is not None and id is None):
        try:
            data_obj = json.loads(data, use_decimal=True)
        except json.JSONDecodeError:
            self.response.set_status(500)
            self.response.clear()
            return
        if not isinstance(data_obj, list):
            self.response.set_status(500)
            self.response.clear()
            return
        for data in data_obj:
            if not 'key' in data:
                self.response.set_status(500)
                self.response.clear()
                return
        count = 0
        for data in data_obj:
            datastore.RunInTransaction(store.update_entity, app, kind, data['key'],
                                       data, metadata_entity, user)
            count += 1
        self.response.out.write(count)
        self.response.out.write("\n")
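# A hedged illustration of the bulk-update branch above (an app/kind path with no
# id): the body must be a JSON list in which every object carries a "key" field.
# The kind, endpoint path and property names are examples only.
def _example_bulk_payload():
    payload = [
        {"key": "1", "name": "alice", "score": 10},
        {"key": "2", "name": "bob", "score": 7},
    ]
    # e.g. POST this to /myapp/Player with Content-Type: application/json
    return json.dumps(payload)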