class ExportConfiguration(DocumentSchema):
    """
    Just a way to configure a single export. Used in the group export config.
    """
    index = JsonProperty()
    name = StringProperty()
    format = StringProperty()

    @property
    def filename(self):
        # file name is the export name plus the extension implied by its format
        extension = Format.from_format(self.format).extension
        return "%s.%s" % (self.name, extension)

    @property
    def type(self):
        # hack - make this backwards compatible with form/case categorization
        # these might only exist in the care-bihar domain or wherever else
        # they've been manually created in the DB.
        try:
            second_component = self.index[1]
        except IndexError:
            # arbitrarily choose default so it doesn't stay hidden from the UI forever.
            return 'form'
        return 'form' if 'http:' in second_component else 'case'

    def __repr__(self):
        return ('%s (%s)' % (self.name, self.index)).encode('utf-8')
class SavedExportSchema(BaseSavedExportSchema, UnicodeMixIn):
    """
    Lets you save an export format with a schema and list of columns
    and display names.
    """
    name = StringProperty()
    default_format = StringProperty()

    is_safe = BooleanProperty(default=False)  # Is the export de-identified?

    # self.index should always match self.schema.index
    # needs to be here so we can use in couch views
    index = JsonProperty()

    # id of an ExportSchema for checkpointed schemas
    schema_id = StringProperty()

    # user-defined table configuration
    tables = SchemaListProperty(ExportTable)

    # For us right now, 'form' or 'case'
    type = StringProperty()

    # ID of the new style export that it was converted to
    converted_saved_export_id = StringProperty()

    def __unicode__(self):
        return "%s (%s)" % (self.name, self.index)

    def transform(self, doc):
        # identity by default; subclasses may override to rewrite each doc
        # before it is written to the export
        return doc

    @property
    def global_transform_function(self):
        # will be called on every value in the doc during export
        return identity

    @property
    @memoized
    def schema(self):
        # load the checkpointed ExportSchema this saved export is based on
        return ExportSchema.get(self.schema_id)

    @property
    def table_name(self):
        # fall back to the doc id when no sheet name has been configured
        return self.sheet_name if self.sheet_name else "%s" % self._id

    @classmethod
    def default(cls, schema, name="", type='form'):
        """Build a default saved export from a schema, configuring only its
        first table."""
        return cls(name=name, index=schema.index, schema_id=schema.get_id,
                   tables=[ExportTable.default(schema.tables[0][0])],
                   type=type)

    @property
    @memoized
    def tables_by_index(self):
        # lookup of user-configured tables keyed by their index
        return dict([t.index, t] for t in self.tables)

    def get_table_configuration(self, index):
        """Return the UI-facing configuration dict for the table at `index`,
        merging the schema's columns with any user configuration."""
        def column_configuration():
            columns = self.schema.get_columns(index)
            if index in self.tables_by_index:
                # user has configured this table: respect their column setup
                return list(
                    self.tables_by_index[index].get_column_configuration(
                        columns))
            else:
                # unconfigured table: expose every schema column, deselected
                return [
                    ExportColumn(index=c, display='').to_config_format(selected=False)
                    for c in columns
                ]

        def display():
            if index in self.tables_by_index:
                return self.tables_by_index[index].display
            else:
                return ''

        return {
            "index": index,
            "display": display(),
            "column_configuration": column_configuration(),
            "selected": index in self.tables_by_index
        }

    def get_table_headers(self, override_name=False):
        """Yield (table name, [header row]) pairs for each configured table.

        When `override_name` is True the first table is labeled with
        `self.table_name` instead of its index.
        """
        return ((self.table_name if override_name and i == 0 else t.index,
                 [t.get_headers_row()])
                for i, t in enumerate(self.tables))

    @property
    def table_configuration(self):
        # configuration dicts for every table known to the schema
        return [
            self.get_table_configuration(index)
            for index, cols in self.schema.tables
        ]

    def update_schema(self):
        """
        Update the schema for this object to include the latest columns from
        any relevant docs.

        Does NOT save the doc, just updates the in-memory object.
        """
        from couchexport.schema import build_latest_schema
        schema = build_latest_schema(self.index)
        if schema:
            self.set_schema(schema)

    def set_schema(self, schema):
        """
        Set the schema for this object.

        Does NOT save the doc, just updates the in-memory object.
        """
        self.schema_id = schema.get_id

    def trim(self, document_table, doc, apply_transforms=True):
        """Restrict a doc's formatted tables to the tables configured on this
        export, trimming each one's rows/columns via ExportTable.trim."""
        tables = []
        for table_index, data in document_table:
            if table_index in self.tables_by_index:
                # todo: currently (index, rows) instead of (display, rows); where best to convert to display?
                tables.append(
                    (table_index,
                     self.tables_by_index[table_index].trim(
                         data, doc, apply_transforms,
                         self.global_transform_function)))
        return tables

    def get_export_components(self, previous_export_id=None, filter=None):
        """Build the export config, the latest schema, and a new schema
        checkpoint for this export.

        NOTE(review): `self.filter` comes from BaseSavedExportSchema (not
        visible here) — presumably a composable export filter; verify there.
        """
        from couchexport.export import ExportConfiguration
        database = get_db()
        config = ExportConfiguration(database, self.index,
                                     previous_export_id,
                                     self.filter & filter)
        # get and checkpoint the latest schema
        updated_schema = config.get_latest_schema()
        export_schema_checkpoint = config.create_new_checkpoint()
        return config, updated_schema, export_schema_checkpoint

    def get_export_files(self, format=None, previous_export=None, filter=None,
                         process=None, max_column_size=None,
                         apply_transforms=True, limit=0, **kwargs):
        """Run the export and return an ExportFiles wrapper around the
        generated temp file (or, for the PYTHON_DICT format, the writer's
        in-memory preview).

        :param process: optional download handle used to report progress
        :param limit: if nonzero, stop after this many docs
        """
        from couchexport.export import get_writer, get_formatted_rows
        if not format:
            format = self.default_format or Format.XLS_2007

        config, updated_schema, export_schema_checkpoint = self.get_export_components(
            previous_export, filter)

        # transform docs onto output and save
        writer = get_writer(format)

        # open the doc and the headers
        formatted_headers = list(self.get_table_headers())
        fd, path = tempfile.mkstemp()
        with os.fdopen(fd, 'wb') as tmp:
            writer.open(formatted_headers, tmp,
                        max_column_size=max_column_size,
                        table_titles=dict([
                            (table.index, table.display)
                            for table in self.tables if table.display
                        ]))

            total_docs = len(config.potentially_relevant_ids)
            if process:
                DownloadBase.set_progress(process, 0, total_docs)
            for i, doc in config.enum_docs():
                if limit and i > limit:
                    break
                if self.transform and apply_transforms:
                    doc = self.transform(doc)
                formatted_tables = self.trim(
                    get_formatted_rows(doc, updated_schema, separator="."),
                    doc,
                    apply_transforms=apply_transforms)
                writer.write(formatted_tables)
                if process:
                    DownloadBase.set_progress(process, i + 1, total_docs)

            writer.close()

        if format == Format.PYTHON_DICT:
            return writer.get_preview()

        return ExportFiles(path, export_schema_checkpoint, format)

    def get_preview_data(self, export_filter, limit=50):
        # render through the dict writer to get an in-memory preview
        return self.get_export_files(Format.PYTHON_DICT, None, export_filter,
                                     limit=limit)

    def download_data(self, format="", previous_export=None, filter=None,
                      limit=0):
        """
        If there is data, return an HTTPResponse with the appropriate data.
        If there is not data returns None.
        """
        from couchexport.shortcuts import export_response
        files = self.get_export_files(format, previous_export, filter,
                                      limit=limit)
        return export_response(files.file, files.format, self.name)

    def to_export_config(self):
        """
        Return an ExportConfiguration object that represents this.
        """
        # confusingly, the index isn't the actual index property,
        # but is the index appended with the id to this document.
        # this is to avoid conflicts among multiple exports
        index = "%s-%s" % (self.index, self._id) if isinstance(self.index, six.string_types) else \
            self.index + [self._id]  # self.index required to be a string or list
        return ExportConfiguration(index=index, name=self.name,
                                   format=self.default_format)

    def custom_validate(self):
        # legacy XLS sheets cannot hold more than 255 columns
        if self.default_format == Format.XLS:
            for table in self.tables:
                if len(table.columns) > 255:
                    raise CustomExportValidationError(
                        "XLS files can only have 255 columns")

    # replaces `sheet_name = StringProperty()`
    # the sheet name is stored as the display of the first table
    def __get_sheet_name(self):
        return self.tables[0].display

    def __set_sheet_name(self, value):
        self.tables[0].display = value

    sheet_name = property(__get_sheet_name, __set_sheet_name)

    @classmethod
    def wrap(cls, data):
        # since this is a property now, trying to wrap it will fail hard
        if 'sheet_name' in data:
            del data['sheet_name']
        return super(SavedExportSchema, cls).wrap(data)
class ExportSchema(Document, UnicodeMixIn):
    """
    An export schema that can store intermittent contents of the export so
    that the entire doc list doesn't have to be used to generate the export

    NOTE(review): a second class with this same name is defined later in
    this module; as written, the later definition shadows this one.
    """
    index = JsonProperty()
    schema = DictProperty()
    timestamp = TimeStampProperty()

    def __unicode__(self):
        return "%s: %s" % (json.dumps(self.index), self.timestamp)

    @classmethod
    def wrap(cls, data):
        # migrate malformed year-1 timestamps to the epoch
        if data.get('timestamp', '').startswith('1-01-01'):
            data['timestamp'] = '1970-01-01T00:00:00Z'
        return super(ExportSchema, cls).wrap(data)

    @classmethod
    def last(cls, index):
        """Return the most recent checkpoint for `index`, or None."""
        return cls.view(
            "couchexport/schema_checkpoints",
            startkey=[json.dumps(index), {}],
            endkey=[json.dumps(index)],
            descending=True,
            limit=1,
            include_docs=True,
            reduce=False,
        ).one()

    @classmethod
    def get_all_checkpoints(cls, index):
        """Yield every checkpoint doc stored for `index`."""
        doc_ids = [
            result["id"]
            for result in cls.get_db().view(
                "couchexport/schema_checkpoints",
                startkey=[json.dumps(index)],
                endkey=[json.dumps(index), {}],
                reduce=False,
            )
        ]
        for doc in iter_docs(cls.get_db(), doc_ids):
            yield cls.wrap(doc)

    # per-instance cache backing the `tables` property
    _tables = None

    @property
    def tables(self):
        # lazily computed (index, header row) pairs from the stored schema
        if self._tables is None:
            from couchexport.export import get_headers
            headers = get_headers(self.schema, separator=".")
            self._tables = [(index, row[0]) for index, row in headers]
        return self._tables

    @property
    def table_dict(self):
        return dict(self.tables)

    def get_columns(self, index):
        # 'id' is always exported as the first column
        return ['id'] + self.table_dict[index].data

    def get_all_ids(self, database=None):
        """Return the set of all doc ids relevant to this schema's index."""
        database = database or self.get_db()
        return set([
            result['id']
            for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                **get_schema_index_view_keys(self.index)).all()
        ])

    def get_new_ids(self, database=None):
        """Return the set of doc ids added since this checkpoint's timestamp."""
        database = database or self.get_db()
        assert self.timestamp, 'exports without timestamps are no longer supported.'
        tag_as_list = force_tag_to_list(self.index)
        startkey = tag_as_list + [self.timestamp.isoformat()]
        endkey = tag_as_list + [{}]
        return set([
            result['id']
            for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                startkey=startkey,
                endkey=endkey)
        ])

    def get_new_docs(self, database=None):
        """Yield the full docs added since this checkpoint.

        Fix: iter_docs takes (database, ids) — see get_all_checkpoints above;
        the previous single-argument call omitted the database.
        """
        database = database or self.get_db()
        return iter_docs(database, self.get_new_ids(database))
class DefaultExportSchema(BaseSavedExportSchema):
    """A saved-export wrapper for unconfigured ("default") exports: exports
    everything the schema knows about, with optional table remapping and a
    short-lived result cache."""
    index = JsonProperty()
    type = StringProperty()

    @property
    def name(self):
        # default exports are identified by their index
        return self.index

    @property
    def indices(self):
        return [self.index]

    def parse_headers(self, headers):
        # keep only the first table's header row, labeled with table_name
        # NOTE(review): table_name presumably comes from BaseSavedExportSchema
        first_header = headers[0][1]
        return [(self.table_name, first_header)]

    def remap_tables(self, tables):
        # can be overridden to rename/remove default stuff from exports
        return tables

    def get_export_components(self, previous_export_id=None, filter=None):
        from couchexport.export import get_export_components
        return get_export_components(self.index, previous_export_id,
                                     filter=self.filter & filter)

    def get_export_files(self, format='', previous_export_id=None, filter=None,
                         use_cache=True, max_column_size=2000, separator='|',
                         process=None, **kwargs):
        """Run the default export and return an ExportFiles, or None when no
        checkpoint was produced. Results are cached (1 hour) for filterless
        queries keyed on (index, previous export, format, column size)."""
        # the APIs of how these methods are broken down suck, but at least
        # it's DRY
        from couchexport.export import get_writer, get_export_components, get_headers, get_formatted_rows
        from django.core.cache import cache
        import hashlib

        export_tag = self.index

        CACHE_TIME = 1 * 60 * 60  # cache for 1 hour, in seconds

        def _build_cache_key(tag, prev_export_id, format, max_column_size):
            def _human_readable_key(tag, prev_export_id, format,
                                    max_column_size):
                return "couchexport_:%s:%s:%s:%s" % (tag, prev_export_id,
                                                     format, max_column_size)
            # hash so the key length stays bounded regardless of tag size
            return hashlib.md5(
                _human_readable_key(
                    tag, prev_export_id, format,
                    max_column_size).encode('utf-8')).hexdigest()

        # check cache, only supported for filterless queries, currently
        cache_key = _build_cache_key(export_tag, previous_export_id, format,
                                     max_column_size)
        if use_cache and filter is None:
            cached_data = cache.get(cache_key)
            if cached_data:
                (tmp, checkpoint) = cached_data
                return ExportFiles(tmp, checkpoint)

        fd, path = tempfile.mkstemp()
        with os.fdopen(fd, 'wb') as tmp:
            schema_index = export_tag
            config, updated_schema, export_schema_checkpoint = get_export_components(
                schema_index, previous_export_id, filter)
            if config:
                writer = get_writer(format)

                # get cleaned up headers
                formatted_headers = self.remap_tables(
                    get_headers(updated_schema, separator=separator))
                writer.open(formatted_headers, tmp,
                            max_column_size=max_column_size)

                total_docs = len(config.potentially_relevant_ids)
                if process:
                    DownloadBase.set_progress(process, 0, total_docs)
                for i, doc in config.enum_docs():
                    if self.transform:
                        doc = self.transform(doc)

                    writer.write(
                        self.remap_tables(
                            get_formatted_rows(doc, updated_schema,
                                               include_headers=False,
                                               separator=separator)))
                    if process:
                        DownloadBase.set_progress(process, i + 1, total_docs)
                writer.close()

            checkpoint = export_schema_checkpoint

        if checkpoint:
            if use_cache:
                # cache the temp file path, not its contents
                cache.set(cache_key, (path, checkpoint), CACHE_TIME)
            return ExportFiles(path, checkpoint)

        return None
class ExportSchema(Document, UnicodeMixIn):
    """
    An export schema that can store intermittent contents of the export so
    that the entire doc list doesn't have to be used to generate the export

    NOTE(review): this shadows an earlier class of the same name in this
    module; this version additionally tracks a (semi-deprecated) couch seq.
    """
    index = JsonProperty()
    seq = StringProperty()  # semi-deprecated
    schema = DictProperty()
    timestamp = TimeStampProperty()

    def __unicode__(self):
        return "%s: %s" % (json.dumps(self.index), self.seq)

    @property
    def is_bigcouch(self):
        # bigcouch seqs are opaque strings; plain couch seqs are integers
        try:
            int(self.seq)
            return False
        except ValueError:
            return True

    @classmethod
    def wrap(cls, data):
        # normalize integer seqs to text; six types keep this py2/py3-safe
        # (the module already uses six — bare `long`/`unicode` would
        # NameError on python 3)
        if isinstance(data.get('seq'), six.integer_types):
            data['seq'] = six.text_type(data['seq'])
        ret = super(ExportSchema, cls).wrap(data)
        if not ret.timestamp:
            # these won't work on bigcouch so we want to know if this happens
            notify_exception(
                None,
                'an export without a timestamp was accessed! %s (%s)' % (
                    ret.index, ret._id))
            # this isn't the cleanest nor is it perfect but in the event
            # this doc traversed databases somehow and now has a bad seq
            # id, make sure to just reset it to 0.
            # This won't catch if the seq is bad but not greater than the
            # current one).
            current_seq = cls.get_db().info()["update_seq"]
            try:
                if int(current_seq) < int(ret.seq):
                    ret.seq = "0"
                    ret.save()
            except ValueError:
                # seqs likely weren't ints (e.g. bigcouch)
                # this should never be possible (anything on bigcouch should
                # have a timestamp) so let's fail hard
                raise Exception(
                    'export %s is in a bad state (no timestamp or integer seq)' % ret._id)
        # TODO? handle seq -> datetime migration
        return ret

    @classmethod
    def last(cls, index):
        """Return the most recent checkpoint for `index`, or None."""
        # search first by timestamp, then fall back to seq id
        shared_kwargs = {
            'descending': True,
            'limit': 1,
            'include_docs': True,
            'reduce': False,
        }
        ret = cls.view("couchexport/schema_checkpoints",
                       startkey=['by_timestamp', json.dumps(index), {}],
                       endkey=['by_timestamp', json.dumps(index)],
                       **shared_kwargs).one()
        if ret and not ret.timestamp:
            # we found a bunch of old checkpoints but they only
            # had seq ids, so use those instead
            ret = cls.view("couchexport/schema_checkpoints",
                           startkey=['by_seq', json.dumps(index), {}],
                           endkey=['by_seq', json.dumps(index)],
                           **shared_kwargs).one()
        return ret

    @classmethod
    def get_all_indices(cls):
        """Yield every distinct (decoded) index that has checkpoints."""
        ret = cls.get_db().view("couchexport/schema_checkpoints",
                                startkey=['by_timestamp'],
                                endkey=['by_timestamp', {}],
                                reduce=True,
                                group=True,
                                group_level=2)
        for row in ret:
            index = row['key'][1]
            try:
                yield json.loads(index)
            except ValueError:
                # ignore this for now - should just be garbage data
                # print "poorly formatted index key %s" % index
                pass

    @classmethod
    def get_all_checkpoints(cls, index):
        """Return a view of all checkpoint docs for `index` (by timestamp)."""
        return cls.view("couchexport/schema_checkpoints",
                        startkey=['by_timestamp', json.dumps(index)],
                        endkey=['by_timestamp', json.dumps(index), {}],
                        include_docs=True,
                        reduce=False)

    # per-instance cache backing the `tables` property
    _tables = None

    @property
    def tables(self):
        # lazily computed (index, header row) pairs from the stored schema
        if self._tables is None:
            from couchexport.export import get_headers
            headers = get_headers(self.schema, separator=".")
            self._tables = [(index, row[0]) for index, row in headers]
        return self._tables

    @property
    def table_dict(self):
        return dict(self.tables)

    def get_columns(self, index):
        # 'id' is always exported as the first column
        return ['id'] + self.table_dict[index].data

    def get_all_ids(self, database=None):
        """Return the set of all doc ids relevant to this schema's index."""
        database = database or self.get_db()
        return set([
            result['id']
            for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                **get_schema_index_view_keys(self.index)).all()
        ])

    def get_new_ids(self, database=None):
        # TODO: deprecate/remove old way of doing this
        database = database or self.get_db()
        if self.timestamp:
            return self._ids_by_timestamp(database)
        else:
            return self._ids_by_seq(database)

    def _ids_by_seq(self, database):
        """Seq-based (legacy) id lookup via the couch changes feed."""
        if self.seq == "0" or self.seq is None:
            return self.get_all_ids()

        consumer = Consumer(database)
        view_results = consumer.fetch(since=self.seq)
        if view_results:
            include_ids = set([res["id"] for res in view_results["results"]])
            return include_ids.intersection(self.get_all_ids())
        else:
            # sometimes this comes back empty. I think it might be a bug
            # in couchdbkit, but it's impossible to consistently reproduce.
            # For now, just assume this is fine.
            return set()

    def _ids_by_timestamp(self, database):
        """Timestamp-based id lookup via the schema_index view."""
        tag_as_list = force_tag_to_list(self.index)
        startkey = tag_as_list + [self.timestamp.isoformat()]
        endkey = tag_as_list + [{}]
        return set([
            result['id']
            for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                startkey=startkey,
                endkey=endkey)
        ])

    def get_new_docs(self, database=None):
        """Yield the full docs added since this checkpoint.

        Fix: iter_docs takes (database, ids) — as used elsewhere in this
        module; the previous single-argument call omitted the database.
        """
        database = database or self.get_db()
        return iter_docs(database, self.get_new_ids(database))