Example #1
0
def mwCreateNewDump(config={}):
    """Run each enabled stage of a new dump, then the three mwSave* helpers.

    The 'pages', 'images' and 'logs' entries of ``config`` are read, in that
    order, to decide which stages run. ``config`` itself is forwarded as a
    keyword argument to every helper that is called.
    """
    sys.stderr.write('Trying generating a new dump into a new directory...')

    # Page stage: fetch titles, persist them, dump the pages, check the XML.
    if config['pages']:
        titles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=titles)
        mwGeneratePageDump(config=config, pagetitles=titles)
        mwCheckXMLIntegrity(config=config, pagetitles=titles)

    # Image stage: fetch the image list, persist it, dump the images.
    if config['images']:
        images = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=images)
        mwGenerateImageDump(config=config, imagenames=images)

    # Log stage.
    if config['logs']:
        mwSaveLogs(config=config)

    # These run unconditionally, in this order.
    for saver in (mwSaveIndexPHP, mwSaveSpecialVersion, mwSaveSiteInfo):
        saver(config=config)
Example #2
0
def mwCreateNewDump(config={}):
    """Run each enabled stage of a new dump, then the three mwSave* helpers.

    The 'pages', 'images' and 'logs' entries of ``config`` are read, in that
    order, to decide which stages run. ``config`` itself is forwarded as a
    keyword argument to every helper that is called.
    """
    sys.stderr.write('Trying generating a new dump into a new directory...')

    # Page stage: fetch titles, persist them, dump the pages, check the XML.
    if config['pages']:
        titles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=titles)
        mwGeneratePageDump(config=config, pagetitles=titles)
        mwCheckXMLIntegrity(config=config, pagetitles=titles)

    # Image stage: fetch the image list, persist it, dump the images.
    if config['images']:
        images = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=images)
        mwGenerateImageDump(config=config, imagenames=images)

    # Log stage.
    if config['logs']:
        mwSaveLogs(config=config)

    # These run unconditionally, in this order.
    for saver in (mwSaveIndexPHP, mwSaveSpecialVersion, mwSaveSiteInfo):
        saver(config=config)
Example #3
0
def mwResumePreviousDump(config={}):
    """Continue a dump that an earlier session left unfinished.

    When ``config['xml']`` is set, the saved title list and the XML dump are
    inspected from their last lines backwards; an incomplete title list is
    reloaded and an incomplete XML dump is resumed or regenerated. When
    ``config['images']`` is set, the saved image list and the ``images``
    directory are inspected the same way. mwSaveIndexPHP, mwSaveSpecialVersion
    and mwSaveSiteInfo are always called at the end.

    Args:
        config: Dump settings. Keys read directly here: 'xml', 'images',
            'logs', 'path', 'date' and 'curonly'. The dict is also passed to
            every helper that is called.
    """
    imagenames = []
    sys.stderr.write('Resuming previous dump process...')
    if config['xml']:
        # NOTE(review): this value is replaced or left unused on every path
        # below; the call is kept in case mwReadPageTitles has side effects.
        pagetitles = mwReadPageTitles(config=config)

        # Start from '' so that a missing or empty titles file is treated as
        # an incomplete list rather than leaving the name unbound.
        lasttitle = ''
        try:
            lasttitles = wikiteam.reverseReadline(
                '%s/%s-%s-titles.txt' %
                (config['path'], domain2prefix(config=config), config['date']))
            # The next() builtin works for both Python 2 and 3 iterators.
            lasttitle = next(lasttitles)
            if lasttitle == '':
                # Skip one trailing empty line.
                lasttitle = next(lasttitles)
        except Exception:
            # Best effort: probably the file does not exist. Ctrl-C and
            # SystemExit are deliberately not swallowed.
            pass
        if lasttitle == '--END--':
            # titles list is complete
            sys.stderr.write('Title list was completed in the previous session')
        else:
            sys.stderr.write('Title list is incomplete. Reloading...')
            # Do not resume; reload the whole list to avoid inconsistencies
            # (deleted pages and so on). No start title is passed: none is
            # known at this point.
            pagetitles = mwGetPageTitles(config=config)
            wikiteam.savePageTitles(config=config, pagetitles=pagetitles)

        # Check the XML dump, reading it from the end.
        xmliscomplete = False
        lastxmltitle = None
        try:
            xmlkind = 'current' if config['curonly'] else 'history'
            xmllines = wikiteam.reverseReadline(
                '%s/%s-%s-%s.xml' %
                (config['path'], domain2prefix(config=config), config['date'],
                 xmlkind))
            for line in xmllines:
                if line == '</mediawiki>':
                    # xml dump is complete
                    xmliscomplete = True
                    break

                # The first <title> met from the end is the last page written.
                xmltitle = re.search(r'<title>([^<]+)</title>', line)
                if xmltitle:
                    lastxmltitle = wikiteam.undoHTMLEntities(
                        text=xmltitle.group(1))
                    break
        except Exception:
            pass  # best effort: probably the file does not exist

        if xmliscomplete:
            sys.stderr.write('XML dump was completed in the previous session')
        elif lastxmltitle:
            # resuming...
            sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle))
            pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
            mwGenerateXMLDump(
                config=config,
                pagetitles=pagetitles,
                start=lastxmltitle)
        else:
            # corrupt? only has XML header?
            sys.stderr.write('XML is corrupt? Regenerating...')
            pagetitles = mwReadPageTitles(config=config)
            mwGenerateXMLDump(config=config, pagetitles=pagetitles)

    if config['images']:
        # Load the saved image list; only lines containing a tab are entries.
        lastimage = ''
        try:
            imagelistpath = '%s/%s-%s-images.txt' % (
                config['path'], domain2prefix(config=config), config['date'])
            # The context manager closes the file even if reading fails.
            with open(imagelistpath, 'r') as f:
                lines = f.read().strip().split('\n')
            for line in lines:
                if '\t' in line:
                    imagenames.append(line.split('\t'))
            lastimage = lines[-1]
        except Exception:
            pass  # best effort: probably the file does not exist
        if lastimage == '--END--':
            sys.stderr.write('Image list was completed in the previous session')
        else:
            sys.stderr.write('Image list is incomplete. Reloading...')
            # Do not resume; reload the whole list to avoid inconsistencies
            # (deleted images and so on).
            imagenames = mwGetImageNames(config=config)
            # Same saver that mwCreateNewDump uses for the image list.
            mwSaveImageNames(config=config, imagenames=imagenames)

        # Collect the names already present in the images directory. A set is
        # used because it is only ever queried for membership.
        existing = set()
        try:
            existing = set(
                # Decode byte names only; text names are used unchanged.
                n.decode('utf-8') if isinstance(n, bytes) else n
                for n in os.listdir('%s/images' % (config['path'])))
        except Exception:
            pass  # best effort: probably the directory does not exist

        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, _url, _uploader in imagenames:
            lastfilename2 = lastfilename
            # always keep the complete filename, not the truncated one
            lastfilename = filename
            filename2 = filename
            # NOTE(review): `other` is not defined inside this function;
            # presumably a module-level dict -- confirm it is in scope.
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in existing:
                complete = False
                break
            c += 1
        sys.stderr.write('%d images were found in the directory from a previous session' % (c))
        if complete:
            # image dump is complete
            sys.stderr.write('Image dump was completed in the previous session')
        else:
            # Resume from the image before the first missing one: it may be
            # corrupted (or missing its .desc) after a ctrl-c or abort in the
            # previous session.
            mwGenerateImageDump(config=config, imagenames=imagenames, start=lastfilename2)

    if config['logs']:
        # TODO: resuming logs is not implemented
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)
Example #4
0
def mwResumePreviousDump(config={}):
    """Continue a dump that an earlier session left unfinished.

    When ``config['xml']`` is set, the saved title list and the XML dump are
    inspected from their last lines backwards; an incomplete title list is
    reloaded and an incomplete XML dump is resumed or regenerated. When
    ``config['images']`` is set, the saved image list and the ``images``
    directory are inspected the same way. mwSaveIndexPHP, mwSaveSpecialVersion
    and mwSaveSiteInfo are always called at the end.

    Args:
        config: Dump settings. Keys read directly here: 'xml', 'images',
            'logs', 'path', 'date' and 'curonly'. The dict is also passed to
            every helper that is called.
    """
    imagenames = []
    sys.stderr.write('Resuming previous dump process...')
    if config['xml']:
        # NOTE(review): this value is replaced or left unused on every path
        # below; the call is kept in case mwReadPageTitles has side effects.
        pagetitles = mwReadPageTitles(config=config)

        # Start from '' so that a missing or empty titles file is treated as
        # an incomplete list rather than leaving the name unbound.
        lasttitle = ''
        try:
            lasttitles = wikiteam.reverseReadline(
                '%s/%s-%s-titles.txt' %
                (config['path'], domain2prefix(config=config), config['date']))
            # The next() builtin works for both Python 2 and 3 iterators.
            lasttitle = next(lasttitles)
            if lasttitle == '':
                # Skip one trailing empty line.
                lasttitle = next(lasttitles)
        except Exception:
            # Best effort: probably the file does not exist. Ctrl-C and
            # SystemExit are deliberately not swallowed.
            pass
        if lasttitle == '--END--':
            # titles list is complete
            sys.stderr.write(
                'Title list was completed in the previous session')
        else:
            sys.stderr.write('Title list is incomplete. Reloading...')
            # Do not resume; reload the whole list to avoid inconsistencies
            # (deleted pages and so on). No start title is passed: none is
            # known at this point.
            pagetitles = mwGetPageTitles(config=config)
            wikiteam.savePageTitles(config=config, pagetitles=pagetitles)

        # Check the XML dump, reading it from the end.
        xmliscomplete = False
        lastxmltitle = None
        try:
            xmlkind = 'current' if config['curonly'] else 'history'
            xmllines = wikiteam.reverseReadline(
                '%s/%s-%s-%s.xml' %
                (config['path'], domain2prefix(config=config), config['date'],
                 xmlkind))
            for line in xmllines:
                if line == '</mediawiki>':
                    # xml dump is complete
                    xmliscomplete = True
                    break

                # The first <title> met from the end is the last page written.
                xmltitle = re.search(r'<title>([^<]+)</title>', line)
                if xmltitle:
                    lastxmltitle = wikiteam.undoHTMLEntities(
                        text=xmltitle.group(1))
                    break
        except Exception:
            pass  # best effort: probably the file does not exist

        if xmliscomplete:
            sys.stderr.write('XML dump was completed in the previous session')
        elif lastxmltitle:
            # resuming...
            sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle))
            pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
            mwGenerateXMLDump(config=config,
                              pagetitles=pagetitles,
                              start=lastxmltitle)
        else:
            # corrupt? only has XML header?
            sys.stderr.write('XML is corrupt? Regenerating...')
            pagetitles = mwReadPageTitles(config=config)
            mwGenerateXMLDump(config=config, pagetitles=pagetitles)

    if config['images']:
        # Load the saved image list; only lines containing a tab are entries.
        lastimage = ''
        try:
            imagelistpath = '%s/%s-%s-images.txt' % (
                config['path'], domain2prefix(config=config), config['date'])
            # The context manager closes the file even if reading fails.
            with open(imagelistpath, 'r') as f:
                lines = f.read().strip().split('\n')
            for line in lines:
                if '\t' in line:
                    imagenames.append(line.split('\t'))
            lastimage = lines[-1]
        except Exception:
            pass  # best effort: probably the file does not exist
        if lastimage == '--END--':
            sys.stderr.write(
                'Image list was completed in the previous session')
        else:
            sys.stderr.write('Image list is incomplete. Reloading...')
            # Do not resume; reload the whole list to avoid inconsistencies
            # (deleted images and so on).
            imagenames = mwGetImageNames(config=config)
            # Same saver that mwCreateNewDump uses for the image list.
            mwSaveImageNames(config=config, imagenames=imagenames)

        # Collect the names already present in the images directory. A set is
        # used because it is only ever queried for membership.
        existing = set()
        try:
            existing = set(
                # Decode byte names only; text names are used unchanged.
                n.decode('utf-8') if isinstance(n, bytes) else n
                for n in os.listdir('%s/images' % (config['path'])))
        except Exception:
            pass  # best effort: probably the directory does not exist

        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, _url, _uploader in imagenames:
            lastfilename2 = lastfilename
            # always keep the complete filename, not the truncated one
            lastfilename = filename
            filename2 = filename
            # NOTE(review): `other` is not defined inside this function;
            # presumably a module-level dict -- confirm it is in scope.
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in existing:
                complete = False
                break
            c += 1
        sys.stderr.write(
            '%d images were found in the directory from a previous session' %
            (c))
        if complete:
            # image dump is complete
            sys.stderr.write(
                'Image dump was completed in the previous session')
        else:
            # Resume from the image before the first missing one: it may be
            # corrupted (or missing its .desc) after a ctrl-c or abort in the
            # previous session.
            mwGenerateImageDump(config=config,
                                imagenames=imagenames,
                                start=lastfilename2)

    if config['logs']:
        # TODO: resuming logs is not implemented
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)