Example #1

import argparse
import bz2

from lxml import etree  # getparent()/getprevious() below require lxml's iterparse
from sqlalchemy.exc import IntegrityError

import db  # project-local module providing open_session() and the Work model


def main():
    argparser = argparse.ArgumentParser()

    argparser.add_argument('-v', '--verbose', action='store_true')
    argparser.add_argument('-i', '--input', dest="file", required=True, help='A Commons dump file (bz2 compressed)')
    argparser.add_argument('-c', '--count', dest="count", type=int, help='Maximum number of objects to add to the database')

    args = argparser.parse_args()

    filename = args.file

    if args.count:
        maxworks = args.count
    else:
        maxworks = -1

    reader = bz2.BZ2File(filename, 'r')
    dump = etree.iterparse(reader, events=('end',))

    db_session = db.open_session()

    count = 0

    # Namespace prefix used by every element in the export-0.9 dump format
    NS = '{http://www.mediawiki.org/xml/export-0.9/}'

    for event, elem in dump:
        if elem.tag == NS + 'page':
            title = elem.findtext('.//' + NS + 'title') or ''
            # Redirect pages point at duplicate works, so skip them
            if title.startswith('File:') and elem.find('.//' + NS + 'redirect') is None:
                try:
                    work_record = db.Work("wmc", title)
                    db_session.add(work_record)
                    db_session.commit()
                except IntegrityError:
                    # already inserted
                    db_session.rollback()

                count += 1
                if args.verbose and count % 10000 == 0:
                    print('processed records: {}'.format(count))

                if count == maxworks:
                    break

            # Free memory: clear the finished <page> and drop processed siblings
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]

    print('processed records: {}'.format(count))
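
A minimal sketch of the project-local `db` module these examples lean on -- a guess at the smallest interface they use (open_session() plus a Work model whose (handler, filename) pair is unique, so a duplicate insert raises IntegrityError, and whose `status` column drives the queueing and export examples below), not the real module:

from sqlalchemy import Column, Integer, String, UniqueConstraint, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Work(Base):
    __tablename__ = 'works'
    __table_args__ = (UniqueConstraint('handler', 'filename'),)

    id = Column(Integer, primary_key=True)
    handler = Column(String)                    # e.g. 'wmc'
    filename = Column(String)                   # e.g. 'File:Example.jpg'
    status = Column(String, default='loaded')   # 'loaded' -> 'queued' -> 'done'
    hashm4 = Column(String)                     # filled in by the worker

    def __init__(self, handler, filename):
        self.handler = handler
        self.filename = filename


_engine = create_engine('sqlite:///works.db')  # placeholder connection URL


def open_session():
    Base.metadata.create_all(_engine)
    return sessionmaker(bind=_engine)()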
Example #2

import sys

from sqlalchemy import select

import db   # project-local module, as above
import wmc  # project-local module exposing the Celery task `process`


def main():
    try:
        max_tasks = int(sys.argv[1])
    except (IndexError, ValueError):  # int() raises ValueError, not TypeError, on bad input
        sys.exit('Usage: {} NUM_TASKS'.format(sys.argv[0]))

    session = db.open_session()

    count = 0
    diff = 0
    
    while max_tasks > 0:
        # Find the next batch of unqueued works
        stmt = select([db.Work.id]).where(
            db.Work.status == 'loaded'
        ).limit(min(max_tasks, 50))
        work_ids = [row[0] for row in session.execute(stmt).fetchall()]

        if not work_ids:
            print('No more unqueued works in the database')
            return

        max_tasks -= len(work_ids)
        count += len(work_ids)
        diff += len(work_ids)

        if diff >= 10000:
            print('Queued works: {}'.format(count))
            diff = 0

        # We expect that this job is not run in parallel, so we don't
        # have to worry about the status changing under our feet
        try:
            stmt = db.Work.__table__.update().where(
                db.Work.id.in_(work_ids)
            ).values(status='queued')
            session.execute(stmt)

            wmc.process.apply_async((work_ids, ))
        except:  # bare on purpose: roll back on any failure, then re-raise
            session.rollback()
            raise
        else:
            session.commit()
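
The wmc.process.apply_async((work_ids,)) call above implies a Celery task; a hypothetical stand-in (the broker URL and the task body are assumptions, not the project's real worker):

from celery import Celery

import db  # the project-local module sketched above

app = Celery('wmc', broker='redis://localhost:6379/0')  # assumed broker URL


@app.task(name='wmc.process')
def process(work_ids):
    # Assumed behaviour: handle each queued work, then mark it done
    session = db.open_session()
    try:
        for work in session.query(db.Work).filter(db.Work.id.in_(work_ids)):
            work.status = 'done'  # the real task would do the actual processing
        session.commit()
    finally:
        session.close()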
Example #4

    def __init__(self):
        """
        Constructor sets up logging.
        """
        self.logger = logger.Logger(logfilepath="/var/log/virt-factory/taskatron.log").logger
        self.session = db.open_session()
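
logger.Logger above belongs to virt-factory; a minimal stand-in built on the standard library, assuming the wrapper only exposes a configured `logger` attribute:

import logging


class Logger(object):
    """Stand-in for virt-factory's logger.Logger (assumed interface)."""

    def __init__(self, logfilepath):
        handler = logging.FileHandler(logfilepath)
        handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        self.logger = logging.getLogger('taskatron')
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)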
Example #6

    def db(self):
        if self._db is None:
            self._db = db.open_session()
        return self._db
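
The one-space indent in the scrape suggests this is a method on some owning class; a plausible surrounding context, with @property added as an assumption so the session opens lazily on first attribute access:

import db  # project-local module, as above


class TaskRunner(object):  # hypothetical class name
    def __init__(self):
        self._db = None  # no session until first use

    @property
    def db(self):
        if self._db is None:
            self._db = db.open_session()
        return self._db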
Example #8

import argparse
import json
import lzma
from datetime import datetime

from sqlalchemy import select

import db   # project-local module, as above
import wmc  # project-local module providing export_work()


def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-l', '--limit', dest="limit", type=int)
    argparser.add_argument('-o', '--offset', dest="offset", type=int)
    argparser.add_argument('--compress', dest='compress', action='store_true', help='Compress output using LZMA')
    argparser.add_argument('outfile')
    args = argparser.parse_args()

    filename = args.outfile

    if args.compress:
        outfile = lzma.open(filename, 'w')
    else:
        outfile = open(filename, 'w')

    error_filename = 'errors_export_' + datetime.now().isoformat() + '.txt'
    error_file = None

    db_session = db.open_session()

    # Find the next batch of finished works
    stmt = select([db.Work]).where(
        (db.Work.status == 'done') &
        (db.Work.hashm4 != None) # TODO: use 'hash' when hashm4 is renamed back
    )

    if args.limit:
        stmt = stmt.limit(args.limit)
    if args.offset:
        stmt = stmt.offset(args.offset)
    works = db_session.connection().execution_options(
        stream_results=True).execute(stmt)

    count = 0

    for work in works:
        if work.handler == 'wmc':
            try:
                pkg = wmc.export_work(work)
            except RuntimeError as e:
                print('Error exporting work {0}: {1}. Work written to error file'.format(count, e))
                if error_file is None:
                    error_file = open(error_filename, 'w')
                error_file.write(str(work))
                error_file.write('\n')
                continue
        else:
            raise RuntimeError('Unknown work handler: %s' % work.handler)

        if args.compress:
            outfile.write(bytes(json.dumps(pkg), 'utf-8'))
            outfile.write(bytes('\n', 'utf-8'))
        else:
            outfile.write(json.dumps(pkg))
            outfile.write('\n')

        count += 1

    print('processed records: {}'.format(count))

    outfile.close()
    if error_file is not None:
        error_file.close()
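
The export is one JSON package per line, optionally LZMA-compressed; reading it back is symmetric (a usage sketch -- read_packages is a hypothetical helper, not part of the project):

import json
import lzma


def read_packages(path, compressed=False):
    # Yield one decoded package per line of the export file
    opener = lzma.open if compressed else open
    with opener(path, 'rt') as infile:
        for line in infile:
            yield json.loads(line)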