def extract_shelfmarks(args):
    """Rebuild the ``ShelfMark`` table from the shelf-marks listed on books.

    Every existing shelf-mark first has its book list emptied and is then
    deleted. Afterwards each title found in ``book.attrs['shelfmarks']`` is
    looked up (and created when missing) and the book is attached to it.
    Finally the prefix shared by all shelf-mark titles is cut off each title.

    :param args: Parsed command-line arguments. Only ``args.configuration``
                 is read; it is passed to ``get_appsettings`` and
                 ``setup_logging`` (presumably the path of the application's
                 configuration file -- confirm against the caller).
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Extracting shelf-marks')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    session = DBSession()

    # Detach all books from the old shelf-marks, then drop the shelf-marks.
    with transaction.manager:
        for stale_mark in session.query(ShelfMark):
            stale_mark.books = []
    with transaction.manager:
        session.query(ShelfMark).delete()

    processed = 0
    with transaction.manager:
        for processed, book in enumerate(session.query(Book), start=1):
            for title in book.attrs['shelfmarks']:
                # NOTE(review): finding shelf-marks created earlier in this
                # loop relies on the session flushing pending objects before
                # the query runs -- confirm autoflush is enabled.
                matching = session.query(ShelfMark).filter(
                    ShelfMark.title == title)
                mark = matching.first()
                if not mark:
                    mark = ShelfMark(title=title)
                    session.add(mark)
                mark.books.append(book)
            if processed % 10000 == 0:
                session.flush()
                logger.debug('%i books processed', processed)
        logger.debug('%i books processed', processed)

    # os.path.commonprefix compares character by character, so it yields the
    # longest leading string shared by every title.
    titles = [mark.title for mark in session.query(ShelfMark)]
    prefix_len = len(os.path.commonprefix(titles))
    with transaction.manager:
        for mark in session.query(ShelfMark):
            mark.title = mark.title[prefix_len:]
    logger.info('Shelf-marks extracted')
def filter_books(args):
    """Delete every book that has no illustrations.

    All books are visited inside one ``transaction.manager`` block; after
    every 10000 books the progress is logged and the transaction is
    committed explicitly. The number of deleted books is logged at the end.

    :param args: Parsed command-line arguments. Only ``args.configuration``
                 is read; it is passed to ``get_appsettings`` and
                 ``setup_logging``.
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Filtering books')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    session = DBSession()

    removed = 0
    with transaction.manager:
        for seen, book in enumerate(session.query(Book), start=1):
            # NOTE(review): the book is re-added before it is inspected,
            # presumably because the periodic commit below can detach
            # objects loaded earlier -- confirm before removing.
            session.add(book)
            if not book.illustrations:
                session.delete(book)
                removed += 1
            if seen % 10000 == 0:
                logger.debug('%i books processed', seen)
                transaction.commit()
    logger.info('%i books filtered', removed)
def load_books(args):
    """Load books from a JSON file into the database.

    ``args.source`` is parsed as JSON and iterated; each entry must provide
    an ``'identifier'`` key and is stored as a ``Book`` whose ``attrs`` is the
    complete entry. The transaction is committed after every 10000 books and
    once more when the ``transaction.manager`` block ends.

    :param args: Parsed command-line arguments. ``args.configuration`` is
                 passed to ``get_appsettings`` and ``setup_logging``;
                 ``args.source`` is the path of the JSON file to read.
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Loading books')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    session = DBSession()

    loaded = 0
    with transaction.manager, open(args.source) as source_file:
        books = json.load(source_file)
        for loaded, book_data in enumerate(books, start=1):
            book = Book(book_identifier=book_data['identifier'],
                        attrs=book_data)
            session.add(book)
            if loaded % 10000 == 0:
                transaction.commit()
                logger.debug('%i books loaded', loaded)
    logger.info('%i books loaded', loaded)
def load_illustrations(args):
    """Load illustration metadata from the ``.tsv`` files below a directory.

    ``args.source`` is walked recursively. Every file whose name ends in
    ``.tsv`` is read as a tab-separated file with a header row, inside its
    own transaction. For each row the book with the matching
    ``book_identifier`` is looked up; rows without a matching book are
    skipped. For the remaining rows the ``date``, ``page``, ``volume`` and
    ``image_idx`` values are converted to ``int`` where possible (values that
    do not parse are kept unchanged) and an ``Illustration`` holding the
    whole row as ``attrs`` is attached to the book.

    The logged counts cover only illustrations that were actually stored.

    :param args: Parsed command-line arguments. ``args.configuration`` is
                 passed to ``get_appsettings`` and ``setup_logging``;
                 ``args.source`` is the directory to search.
    :raises KeyError: If a row lacks ``book_identifier``, ``flickr_id`` or one
                      of the integer columns.
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Loading illustrations')
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    dbsession = DBSession()

    int_fields = ('date', 'page', 'volume', 'image_idx')
    count = 0
    for path, _, filenames in os.walk(args.source):
        for filename in filenames:
            if not filename.endswith('.tsv'):
                continue
            # One transaction per file, committed when the block exits.
            with transaction.manager:
                with open(os.path.join(path, filename)) as f:
                    for line in DictReader(f, dialect='excel-tab'):
                        db_book = dbsession.query(Book).filter(
                            Book.book_identifier == line['book_identifier']
                        ).first()
                        if not db_book:
                            # No such book in the database: ignore the row.
                            continue
                        for field in int_fields:
                            try:
                                line[field] = int(line[field])
                            except ValueError:
                                # Best effort: keep the raw text value.
                                pass
                        illustration = Illustration(
                            flickr_id=line['flickr_id'], attrs=line)
                        db_book.illustrations.append(illustration)
                        dbsession.add(illustration)
                        count += 1
                        if count % 10000 == 0:
                            logger.debug('%i illustrations loaded', count)
    logger.info('%i illustrations loaded', count)
def create_shelves(args):
    """Rebuild the ``Shelf`` tree from the existing shelf-marks.

    The work happens in four steps, each in its own transaction:

    1. All existing shelves are deleted and the shelf-marks, ordered by
       title, are packed onto new leaf shelves. A new shelf is started
       whenever adding the next shelf-mark would raise the shelf's book
       count above 200.
    2. While more than 50 shelves have no parent, the parentless shelves are
       grouped, in ``order``, under new parent shelves; a new parent is
       started once the current one has more than 50 children.
    3. A single root shelf is created and becomes the parent of every
       remaining parentless shelf.
    4. ``start`` / ``end`` of each shelf are filled in from its first / last
       child or shelf-mark, and the root is labelled 'Explore the Stacks'.

    :param args: Parsed command-line arguments. Only ``args.configuration``
                 is read; it is passed to ``get_appsettings`` and
                 ``setup_logging``.
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Creating shelves')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    session = DBSession()

    def parentless():
        """Return a fresh query for all shelves that have no parent."""
        # `== None` is deliberate: this is a SQLAlchemy column expression,
        # not a Python identity test.
        return session.query(Shelf).filter(
            Shelf.parent_id == None)  # noqa: E711

    def create_titles(shelf):
        """Set ``start``/``end`` on *shelf* and, first, on its descendants."""
        children = shelf.children
        if children:
            for child in children:
                create_titles(child)
            shelf.start = children[0].start
            shelf.end = children[-1].end
        elif shelf.shelf_marks:
            marks = shelf.shelf_marks
            shelf.start = marks[0].title
            shelf.end = marks[-1].title

    with transaction.manager:
        session.query(Shelf).delete()

    # Step 1: pack shelf-marks onto leaf shelves.
    current_shelf = None
    books_on_shelf = 0
    order = 0
    processed = 0
    with transaction.manager:
        ordered_marks = session.query(ShelfMark).order_by(ShelfMark.title)
        for processed, shelf_mark in enumerate(ordered_marks, start=1):
            mark_size = len(shelf_mark.books)
            if not current_shelf or books_on_shelf + mark_size > 200:
                order += 1
                current_shelf = Shelf(order=order)
                session.add(current_shelf)
                books_on_shelf = 0
            current_shelf.shelf_marks.append(shelf_mark)
            books_on_shelf += mark_size
            if processed % 10000 == 0:
                session.flush()
                logger.debug('%s shelfmarks processed', processed)
        logger.debug('%s shelfmarks processed', processed)

    # Step 2: add intermediate levels until few enough top-level shelves
    # remain.
    logger.debug('Creating shelf hierarchy')
    with transaction.manager:
        while parentless().count() > 50:
            order = 0
            parent_shelf = None
            child_count = 0
            for orphan in parentless().order_by(Shelf.order):
                if not parent_shelf or child_count > 50:
                    order += 1
                    parent_shelf = Shelf(order=order)
                    session.add(parent_shelf)
                    child_count = 0
                orphan.parent = parent_shelf
                child_count += 1

    # Step 3: hang everything that is still parentless below one root.
    with transaction.manager:
        root_shelf = Shelf()
        session.add(root_shelf)
        for top_level in parentless():
            if top_level != root_shelf:
                top_level.parent = root_shelf

    # Step 4: derive the start/end labels bottom-up, then label the root.
    logger.debug('Creating shelf titles')
    with transaction.manager:
        root_shelf = parentless().first()
        create_titles(root_shelf)
        root_shelf.start = 'Explore the Stacks'
        root_shelf.end = 'Explore the Stacks'
    logger.info('Shelves created')