def __init__(
    self,
    svn_repository=None,
    svn_repository_local_copy=None,
    dsn=None,
    user='******',
    images_cache_local=None,
    images_cache_url=None,
):
    """Create the SVN and database back-ends and store the image-cache settings.

    arguments:
        svn_repository, svn_repository_local_copy: passed on to SVNRepository
        dsn: passed on to DBRepository
        user: must be truthy; passed on to DBRepository and kept as self.user
        images_cache_local: optional filesystem path; a warning is printed
            if it is given but does not exist
        images_cache_url: stored as-is

    raises:
        AssertionError: if ``user`` is empty
    """
    assert user
    self.svn_repository = SVNRepository(
        svn_repository=svn_repository,
        svn_repository_local_copy=svn_repository_local_copy,
    )
    # single parenthesised argument: parses as a print statement on
    # Python 2 and as a function call on Python 3
    print('initializing %s' % self)
    self.db = DBRepository(
        dsn=dsn,
        user=user,
        repository=self,
    )
    self.db.repository = self

    if images_cache_local:
        # A missing cache directory is only reported, never fatal. This is an
        # explicit check (not an assert) so it also runs under ``python -O``.
        try:
            cache_exists = os.path.exists(images_cache_local)
        except (TypeError, ValueError):
            # unusable path value: treat it the same as a missing path
            cache_exists = False
        if not cache_exists:
            print('this path (for "images_cache_local") does not exist; %s' % images_cache_local)

    self.images_cache_local = images_cache_local
    self.images_cache_url = images_cache_url
    self.user = user
class Repository(object):
    """Single entry point for persons, biographies and sources.

    Read operations are delegated to ``self.db`` (a DBRepository). Mutating
    operations consult the ENABLE_DB / ENABLE_SVN switches; the SVN code
    paths are not implemented and raise NotImplementedError when enabled.
    """

    # back-end switches checked by the mutating methods
    ENABLE_SVN = False
    ENABLE_DB = True

    def __init__(
        self,
        svn_repository=None,
        svn_repository_local_copy=None,
        dsn=None,
        user='******',
        images_cache_local=None,
        images_cache_url=None,
    ):
        """Create the SVN and database back-ends and store the image-cache settings.

        arguments:
            svn_repository, svn_repository_local_copy: passed on to SVNRepository
            dsn: passed on to DBRepository
            user: must be truthy; passed on to DBRepository and kept as self.user
            images_cache_local: optional filesystem path; a warning is printed
                if it is given but does not exist
            images_cache_url: stored as-is

        raises:
            AssertionError: if ``user`` is empty
        """
        assert user
        self.svn_repository = SVNRepository(
            svn_repository=svn_repository,
            svn_repository_local_copy=svn_repository_local_copy,
        )
        # single parenthesised argument: parses as a print statement on
        # Python 2 and as a function call on Python 3
        print('initializing %s' % self)
        self.db = DBRepository(
            dsn=dsn,
            user=user,
            repository=self,
        )
        self.db.repository = self

        if images_cache_local:
            # A missing cache directory is only reported, never fatal. This is
            # an explicit check (not an assert) so it also runs under
            # ``python -O``.
            try:
                cache_exists = os.path.exists(images_cache_local)
            except (TypeError, ValueError):
                # unusable path value: treat it the same as a missing path
                cache_exists = False
            if not cache_exists:
                print('this path (for "images_cache_local") does not exist; %s' % images_cache_local)

        self.images_cache_local = images_cache_local
        self.images_cache_url = images_cache_url
        self.user = user

    def get_bioport_ids(self):
        """return _all_ bioport_ids in the system"""
        return self.db.get_bioport_ids()

    def get_person(self, bioport_id):
        """Return what the db layer yields for the person with this bioport_id."""
        return self.db.get_person(bioport_id=bioport_id, repository=self)

    def count_persons(self, **args):
        """Forward the given filter arguments to the db layer's count_persons."""
        return self.db.count_persons(**args)

    def get_persons(self, **args):
        """Get persons satisfying the given arguments

        arguments:
            order_by - a string - default is 'sort_key'

        returns:
            a PersonList instance - a list of Person instances
        """
        return self.db.get_persons(**args)

    def get_persons_sequence(self, *args, **kwargs):
        """Forward all arguments to the db layer's get_persons_sequence."""
        return self.db.get_persons_sequence(*args, **kwargs)

    def get_bioport_id(self, url_biography):
        """Ask the db layer for the bioport id belonging to a biography URL."""
        return self.db.get_bioport_id(url_biography=url_biography)

    def delete_person(self, person):
        """Delete ``person`` through the enabled back-end.

        raises:
            NotImplementedError: if only the SVN back-end is enabled
        """
        if self.ENABLE_DB:
            return self.db.delete_person(person)
        if self.ENABLE_SVN:
            raise NotImplementedError()

    def count_biographies(self, **args):
        """Forward the given filter arguments to the db layer's count_biographies."""
        return self.db.count_biographies(**args)

    def get_biographies(self, **args):
        """Return the biographies matching ``args`` from the enabled back-end.

        raises:
            NotImplementedError: if only the SVN back-end is enabled
        """
        if self.ENABLE_DB:
            return self.db.get_biographies(**args)
        elif self.ENABLE_SVN:
            raise NotImplementedError()

    def get_biography(self, local_id=None, **args):
        """Forward ``local_id`` and any other arguments to the db layer's get_biography."""
        return self.db.get_biography(local_id=local_id, **args)

    def redirects_to(self, bioport_id):
        """Forward to the db layer's redirects_to for this bioport_id."""
        return self.db.redirects_to(bioport_id)

    def add_source(self, source):
        """add a source of data to the db

        returns:
            the source that was passed in

        raises:
            ValueError: if a source with the same id is already registered
        """
        if source.id in [src.id for src in self.get_sources()]:
            raise ValueError('A source with id %s already exists' % source.id)
        self.db.add_source(source)
        return source

    def delete_source(self, source):
        """Delete ``source`` through the db layer."""
        return self.db.delete_source(source)

    def get_source(self, id):  # @ReservedAssignment
        """Return the first registered source whose id equals ``id``.

        raises:
            ValueError: if no such source exists; the message lists the
                ids that are available
        """
        # fetch once and reuse the list for the error message
        sources = self.get_sources()
        for src in sources:
            if src.id == id:
                return src
        raise ValueError('No source found with id %s\nAvailabe sources are %s' % (id, [s.id for s in sources]))

    def get_sources(self, order_by='quality', desc=True):
        """
        return: a list of Source instances
        """
        return self.db.get_sources(order_by=order_by, desc=desc)

    def get_status_value(self, k, default=None):
        """Look up key ``k`` in STATUS_VALUES, falling back to ``default``."""
        return dict(STATUS_VALUES).get(k, default)

    def get_status_values(self):
        """Return the module-level STATUS_VALUES."""
        return STATUS_VALUES

    def get_source_types(self):
        """Return the module-level SOURCE_TYPES."""
        return SOURCE_TYPES

    def get_religion_values(self):
        """Return the module-level RELIGION_VALUES."""
        return RELIGION_VALUES

    def get_author(self, author_id):
        """Return the author with ``author_id`` from the db layer.

        raises:
            NotImplementedError: if the db back-end is disabled
        """
        if self.ENABLE_DB:
            return self.db.get_author(author_id)
        raise NotImplementedError

    def save(self, x):
        """Save a Biography or a Source, dispatching on the type of ``x``.

        raises:
            TypeError: if ``x`` is neither a Biography nor a Source
        """
        # isinstance (rather than comparing __class__) also accepts subclasses
        if isinstance(x, Biography):
            self.save_biography(x)
        elif isinstance(x, Source):
            self.save_source(x)
        else:
            raise TypeError('Cannot save a object %s in the repository: unknown type' % x)

    def save_source(self, source):
        """Attach ``source`` to this repository and save it.

        raises:
            NotImplementedError: if the SVN back-end is enabled
        """
        source.repository = self
        if self.ENABLE_DB:
            self.db.save_source(source)
        if self.ENABLE_SVN:
            raise NotImplementedError()

    def save_person(self, person):
        """Save ``person`` through the db layer.

        raises:
            NotImplementedError: if the SVN back-end is enabled
        """
        if self.ENABLE_DB:
            self.db.save_person(person)
        if self.ENABLE_SVN:
            raise NotImplementedError()

    def save_biography(self, biography, comment=''):
        """Attach ``biography`` to this repository and save it as self.user.

        returns:
            whatever the db layer returns from save_biography when the db
            back-end is enabled, otherwise the biography that was passed in

        raises:
            NotImplementedError: if the SVN back-end is enabled
        """
        biography.repository = self
        if self.ENABLE_DB:
            biography = self.db.save_biography(biography, user=self.user, comment=comment)
        if self.ENABLE_SVN:
            raise NotImplementedError()
        return biography

    def detach_biography(self, biography):
        """Detach ``biography`` through the db layer and return its result."""
        return self.db.detach_biography(biography)

    def delete_biographies(self, source):
        """Delete all biographies belonging to ``source``.

        raises:
            ValueError: if ``source`` is not a registered source
            NotImplementedError: if the SVN back-end is enabled
        """
        if source.id not in [src.id for src in self.get_sources()]:
            raise ValueError("no source with id %s was found" % source.id)
        if self.ENABLE_DB:
            self.db.delete_biographies(source)
        if self.ENABLE_SVN:
            raise NotImplementedError

    def delete_biography(self, biography):
        """Delete a single biography through the db layer."""
        return self.db.delete_biography(biography)

    def download_biographies(self, source, limit=None):
        """Replace the biographies of ``source`` with freshly downloaded ones.

        The list of biography locations is read from ``source.url``. All
        biographies previously imported from this source are deleted, then
        every listed document is loaded and saved again.

        arguments:
            source: a Source instance; its ``url`` must be set
            limit: if given and non-zero, only the first ``limit`` entries
                of the parsed list are imported

        returns:
            a tuple (total, skipped): ``total`` is the number of entries
            selected for import, ``skipped`` is currently always 0

        raises:
            AssertionError: if ``source.url`` is empty
            BioPortException: if the list at ``source.url`` is not valid XML
                or contains no links
        """
        # at the URL given we find a list of links to biodes files
        assert source.url, 'No URL was defined with the source "%s"' % source.id

        logging.info('downloading data at %s' % source.url)
        logging.info('parsing source url')

        # TODO: perhaps it would be better to check on Source.__init__ if repository argument is given
        if not source.repository:
            source.repository = self
        try:
            ls = biodes.parse_list(source.url)
            if limit:
                ls = ls[:limit]
        except etree.XMLSyntaxError as error:  # @UndefinedVariable
            raise BioPortException('Error parsing data at %s -- check if this is valid XML\n%s' % (source.url, error))

        if not ls:
            raise BioPortException('The file at %s does not contain any links to biographies' % source.url)

        # we have a valid list of biographies to download
        # first we remove all previously imported biographies at this source
        logging.info('deleting existing biographies from %s' % source)
        self.delete_biographies(source=source)

        logging.info('downloading biodes files')
        total = len(ls)
        skipped = 0
        ls.sort()
        for iteration, biourl in enumerate(ls):
            iteration += 1
            # https links are URLs as well; running them through normpath
            # would collapse the "//" and turn them into bogus relative paths
            if not biourl.startswith(("http:", "https:")):
                # we're dealing with a fs path
                biourl = os.path.normpath(biourl)
                if not os.path.isabs(biourl):
                    # relative paths are resolved against the list's location
                    biourl = os.path.join(os.path.dirname(source.url), biourl)

            if limit and iteration > limit:
                break

            logging.info('progress %s/%s: adding biography at %s' % (iteration, len(ls), biourl))
            # create a Biography object
            bio = Biography(source_id=source.id, repository=source.repository)
            bio.from_url(biourl)
            bio.save(user='', comment=u'downloaded biography from source %s' % source)

        # remove the temp directory which has been used to extract
        # the xml files
        if ls[0].startswith("/tmp/"):
            shutil.rmtree(os.path.dirname(ls[0]))

        s = '%s biographies downloaded from source %s' % (iteration, source.id)
        logging.info(s)
        source.last_bios_update = time.time()
        self.save_source(source)

        return total, skipped