import argparse
import bz2

from lxml import etree
from sqlalchemy.exc import IntegrityError

import db  # project-local database module

# Element names in the dump are namespaced; keep the prefix in one place.
MW_NS = '{http://www.mediawiki.org/xml/export-0.9/}'


def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-v', '--verbose', action='store_true')
    argparser.add_argument('-i', '--input', dest='file', required=True,
                           help='A Commons dump file (bz2 compressed)')
    argparser.add_argument('-c', '--count', dest='count', type=int,
                           help='Maximum number of objects to add to the database')
    args = argparser.parse_args()

    filename = args.file
    if args.count:
        maxworks = args.count
    else:
        maxworks = -1

    reader = bz2.BZ2File(filename, 'r')
    dump = etree.iterparse(reader, events=('end',))

    db_session = db.open_session()

    count = 0
    for event, elem in dump:
        if (elem.tag == MW_NS + 'page'
                and elem.findtext('.//' + MW_NS + 'title').startswith('File:')):
            filename = elem.findtext('.//' + MW_NS + 'title')

            # Redirect pages carry no file content, so skip them.
            if elem.find('.//' + MW_NS + 'redirect') is not None:
                continue

            # Duplicate work
            try:
                work_record = db.Work("wmc", filename)
                db_session.add(work_record)
                db_session.commit()
            except IntegrityError:
                # already inserted
                db_session.rollback()

            count += 1
            if args.verbose and count % 10000 == 0:
                print('processed records: {}'.format(count))

            if count == maxworks:
                break

        # Free memory as we go: clear the element and drop processed siblings.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    print('processed records: {}'.format(count))
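# All of the snippets in this section call into a project-local `db` module
# (db.open_session(), db.Work) that is not shown. What follows is a minimal
# sketch of what that module might look like, assuming SQLAlchemy 1.x (to
# match the legacy select([...]) calls used below). The table name, column
# names, connection URL, and the uniqueness constraint are assumptions
# inferred from usage; only the names Work, open_session, the
# handler/filename constructor arguments, the status values, and hashm4 come
# from the code in this section. The unique constraint is what makes the
# IntegrityError-based duplicate handling in the loader above work.

from sqlalchemy import (Column, Integer, String, UniqueConstraint,
                        create_engine)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Work(Base):
    __tablename__ = 'works'
    # One row per (handler, title); inserting the same work twice raises
    # IntegrityError, which the loader treats as "already inserted".
    __table_args__ = (UniqueConstraint('handler', 'filename'),)

    id = Column(Integer, primary_key=True)
    handler = Column(String)                    # e.g. 'wmc'
    filename = Column(String)                   # e.g. 'File:Example.jpg'
    status = Column(String, default='loaded')   # 'loaded' -> 'queued' -> 'done'
    hashm4 = Column(String)                     # filled in during processing

    def __init__(self, handler, filename):
        self.handler = handler
        self.filename = filename


# Hypothetical connection URL; the real project presumably configures this
# elsewhere.
_engine = create_engine('sqlite:///works.db')
_Session = sessionmaker(bind=_engine)


def open_session():
    """Return a new session bound to the project engine."""
    Base.metadata.create_all(_engine)
    return _Session()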
import sys

from sqlalchemy import select

import db   # project-local database module
import wmc  # project-local module defining the Celery tasks


def main():
    try:
        max_tasks = int(sys.argv[1])
    except (IndexError, ValueError):
        sys.exit('Usage: {} NUM_TASKS'.format(sys.argv[0]))

    session = db.open_session()

    count = 0
    diff = 0
    while max_tasks > 0:
        # Find the next bunch of unqueued works
        stmt = select([db.Work.id]).where(
            db.Work.status == 'loaded'
        ).limit(min(max_tasks, 50))
        work_ids = [row[0] for row in session.execute(stmt).fetchall()]

        if not work_ids:
            print('No more unqueued works in the database')
            return

        max_tasks -= len(work_ids)
        count += len(work_ids)
        diff += len(work_ids)

        if diff >= 10000:
            print('Queued works: {}'.format(count))
            diff = 0

        # We expect that this job is not run in parallel, so we don't
        # have to worry about the status changing under our feet
        try:
            stmt = db.Work.__table__.update().where(
                db.Work.id.in_(work_ids)
            ).values(status='queued')
            session.execute(stmt)
            wmc.process.apply_async((work_ids,))
        except Exception:
            session.rollback()
            raise
        else:
            session.commit()
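# wmc.process.apply_async((work_ids,)) in the queueing function above is
# Celery's calling API: the script only flips rows to 'queued' and hands the
# id batch to a worker. The actual task body is not part of this section; the
# sketch below only illustrates the shape such a task could take, assuming
# Celery and the hypothetical db helpers sketched earlier. The broker URL and
# the per-work processing step are placeholders, not the project's real
# implementation.

from celery import Celery

import db  # project-local database module (see the sketch above)

app = Celery('wmc', broker='redis://localhost:6379/0')  # assumed broker


@app.task
def process(work_ids):
    """Process a batch of queued works and mark them as done."""
    session = db.open_session()
    try:
        works = session.query(db.Work).filter(db.Work.id.in_(work_ids)).all()
        for work in works:
            # ... real per-work processing (hashing, metadata extraction,
            # etc.) would happen here ...
            work.status = 'done'
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()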
def __init__(self):
    """
    Constructor sets up logging
    """
    self.logger = logger.Logger(
        logfilepath="/var/log/virt-factory/taskatron.log").logger
    self.session = db.open_session()
def db(self):
    # Lazily open a database session the first time it is needed, then
    # reuse the cached one.
    if self._db is None:
        self._db = db.open_session()
    return self._db
import argparse
import json
import lzma
from datetime import datetime

from sqlalchemy import select

import db   # project-local database module
import wmc  # project-local export module


def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-l', '--limit', dest='limit', type=int)
    argparser.add_argument('-o', '--offset', dest='offset', type=int)
    argparser.add_argument('--compress', dest='compress', action='store_true',
                           help='Compress output using LZMA')
    argparser.add_argument('outfile')
    args = argparser.parse_args()

    filename = args.outfile
    if args.compress:
        outfile = lzma.open(filename, 'w')
    else:
        outfile = open(filename, 'w')

    error_filename = 'errors_export_' + datetime.now().isoformat() + '.txt'
    error_file = None

    db_session = db.open_session()

    # Find the next bunch of finished works
    stmt = select([db.Work]).where(
        (db.Work.status == 'done') &
        (db.Work.hashm4 != None)  # TODO: use 'hash' when hashm4 is renamed back
    )

    if args.limit:
        stmt = stmt.limit(args.limit)
    if args.offset:
        stmt = stmt.offset(args.offset)

    works = db_session.connection().execution_options(
        stream_results=True).execute(stmt)

    count = 0
    for work in works:
        if work.handler == 'wmc':
            try:
                pkg = wmc.export_work(work)
            except RuntimeError:
                print('Error exporting work {0}. Work written to error file'.format(count))
                if error_file is None:
                    error_file = open(error_filename, 'w')
                error_file.write(str(work))
                error_file.write('\n')
                continue
        else:
            raise RuntimeError('Unknown work handler: %s' % work.handler)

        if args.compress:
            outfile.write(bytes(json.dumps(pkg), 'utf-8'))
            outfile.write(bytes('\n', 'utf-8'))
        else:
            outfile.write(json.dumps(pkg))
            outfile.write('\n')

        count += 1

    print('processed records: {}'.format(count))

    outfile.close()
    if error_file is not None:
        error_file.close()
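# The export script above writes one JSON object per line, optionally LZMA
# compressed. Reading such a dump back is symmetrical; this is only a usage
# sketch, and the file name in the example is illustrative, not a project
# convention.

import json
import lzma


def read_export(path, compressed=False):
    """Yield one decoded work package per line of an export file."""
    opener = lzma.open if compressed else open
    with opener(path, 'rt', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if line:
                yield json.loads(line)


# Example:
# for pkg in read_export('works_export.jsonl.xz', compressed=True):
#     print(pkg)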