def mwCreateNewDump(config=None):
    """Generate a brand new dump, driven by the flags in *config*.

    config is indexed with the keys 'pages', 'images' and 'logs' and is
    forwarded unchanged to every helper:

    * 'pages' truthy: fetch the page titles, save them, generate the page
      dump and run mwCheckXMLIntegrity on it.
    * 'images' truthy: fetch the image names, save them and generate the
      image dump.
    * 'logs' truthy: call mwSaveLogs.

    mwSaveIndexPHP, mwSaveSpecialVersion and mwSaveSiteInfo are always
    called at the end, whatever the flags are.

    A dict without one of the keys above raises KeyError.
    """
    # A None sentinel avoids sharing one dict object between calls.
    if config is None:
        config = {}

    sys.stderr.write('Trying generating a new dump into a new directory...')

    if config['pages']:
        pagetitles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
        mwGeneratePageDump(config=config, pagetitles=pagetitles)
        mwCheckXMLIntegrity(config=config, pagetitles=pagetitles)

    if config['images']:
        imagenames = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=imagenames)
        mwGenerateImageDump(config=config, imagenames=imagenames)

    if config['logs']:
        mwSaveLogs(config=config)

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)
def _mwResumePageDump(config):
    """Check the saved title list and XML dump; reload or resume them.

    Private helper of mwResumePreviousDump, used when config['xml'] is
    truthy.

    NOTE(review): domain2prefix and mwGenerateXMLDump are not defined in the
    visible part of this file -- confirm they are in scope.
    """
    # Kept from the original flow; pagetitles is reassigned before any use.
    pagetitles = mwReadPageTitles(config=config)

    # --- title list -------------------------------------------------------
    # Default to '' so a missing or empty titles file is handled as an
    # incomplete list instead of leaving lasttitle unbound.
    lasttitle = ''
    try:
        lasttitles = wikiteam.reverseReadline(
            '%s/%s-%s-titles.txt' % (
                config['path'],
                domain2prefix(config=config),
                config['date']))
        lasttitle = next(lasttitles)
        if lasttitle == '':
            # skip one empty line at the end of the file
            lasttitle = next(lasttitles)
    except Exception:
        # Best effort: probably the file does not exist. Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit through.
        pass

    if lasttitle == '--END--':
        # titles list is complete
        sys.stderr.write('Title list was completed in the previous session')
    else:
        sys.stderr.write('Title list is incomplete. Reloading...')
        # do not resume, reload, to avoid inconsistences, deleted pages or
        # so -- hence a full fetch with no start title
        pagetitles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)

    # --- XML dump ---------------------------------------------------------
    xmliscomplete = False
    lastxmltitle = None
    try:
        xmllines = wikiteam.reverseReadline(
            '%s/%s-%s-%s.xml' % (
                config['path'],
                domain2prefix(config=config),
                config['date'],
                'current' if config['curonly'] else 'history'))
        # Walk the file backwards and stop at the first decisive line.
        for line in xmllines:
            if line == '</mediawiki>':
                # xml dump is complete
                xmliscomplete = True
                break
            xmltitle = re.search(r'<title>([^<]+)</title>', line)
            if xmltitle:
                lastxmltitle = wikiteam.undoHTMLEntities(
                    text=xmltitle.group(1))
                break
    except Exception:
        pass  # best effort: probably the file does not exist

    if xmliscomplete:
        sys.stderr.write('XML dump was completed in the previous session')
    elif lastxmltitle:
        # resuming...
        sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle))
        pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
        mwGenerateXMLDump(
            config=config, pagetitles=pagetitles, start=lastxmltitle)
    else:
        # corrupt? only has XML header?
        sys.stderr.write('XML is corrupt? Regenerating...')
        pagetitles = mwReadPageTitles(config=config)
        mwGenerateXMLDump(config=config, pagetitles=pagetitles)


def _mwResumeImageDump(config):
    """Check the saved image list and images directory; reload or resume.

    Private helper of mwResumePreviousDump, used when config['images'] is
    truthy.

    NOTE(review): domain2prefix, other and truncateFilename are not defined
    in the visible part of this file -- confirm they are in scope.
    """
    # --- image list -------------------------------------------------------
    imagenames = []
    lastimage = ''
    try:
        # The with-block closes the file even when reading fails.
        with open('%s/%s-%s-images.txt' % (
                config['path'],
                domain2prefix(config=config),
                config['date']), 'r') as f:
            lines = f.read().strip().split('\n')
        # Only tab-separated lines are image records.
        imagenames = [line.split('\t') for line in lines if '\t' in line]
        lastimage = lines[-1]
    except Exception:
        pass  # best effort: probably the file does not exist

    if lastimage == '--END--':
        sys.stderr.write('Image list was completed in the previous session')
    else:
        sys.stderr.write('Image list is incomplete. Reloading...')
        # do not resume, reload, to avoid inconsistences, deleted images or
        # so
        imagenames = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=imagenames)

    # --- images directory -------------------------------------------------
    existing = set()
    try:
        # Decode only byte names; text names are used as they are.
        existing = set(
            n.decode('utf-8') if isinstance(n, bytes) else n
            for n in os.listdir('%s/images' % (config['path'])))
    except Exception:
        pass  # best effort: probably the directory does not exist

    complete = True
    lastfilename = ''
    lastfilename2 = ''
    c = 0
    for filename, _url, _uploader in imagenames:
        lastfilename2 = lastfilename
        # return always the complete filename, not the truncated
        lastfilename = filename
        filename2 = filename
        if len(filename2) > other['filenamelimit']:
            filename2 = truncateFilename(other=other, filename=filename2)
        if filename2 not in existing:
            complete = False
            break
        c += 1

    sys.stderr.write(
        '%d images were found in the directory from a previous session'
        % (c))
    if complete:
        # image dump is complete
        sys.stderr.write('Image dump was completed in the previous session')
    else:
        # we resume from previous image, which may be corrupted (or missing
        # .desc) by the previous session ctrl-c or abort
        mwGenerateImageDump(
            config=config, imagenames=imagenames, start=lastfilename2)


def mwResumePreviousDump(config=None):
    """Resume a dump started in a previous session.

    config is indexed with the keys 'xml', 'images' and 'logs' here and
    with 'path', 'date' and 'curonly' in the helpers:

    * 'xml' truthy: verify the title list and the XML dump, then reload or
      resume them.
    * 'images' truthy: verify the image list and the images directory,
      then reload or resume them.
    * 'logs' truthy: nothing is done yet.

    mwSaveIndexPHP, mwSaveSpecialVersion and mwSaveSiteInfo are always
    called at the end. A dict without 'xml', 'images' or 'logs' raises
    KeyError.

    NOTE(review): this file defines mwResumePreviousDump a second time
    further down; the later definition replaces this one at import time.
    NOTE(review): mwCreateNewDump tests config['pages'] where this function
    tests config['xml'] -- confirm which key callers set.
    """
    # A None sentinel avoids sharing one dict object between calls.
    if config is None:
        config = {}

    sys.stderr.write('Resuming previous dump process...')

    if config['xml']:
        _mwResumePageDump(config)

    if config['images']:
        _mwResumeImageDump(config)

    if config['logs']:
        # fix: resuming logs is not implemented
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)
def _mwResumePageDump(config):
    """Check the saved title list and XML dump; reload or resume them.

    Private helper of mwResumePreviousDump, used when config['xml'] is
    truthy.

    NOTE(review): domain2prefix and mwGenerateXMLDump are not defined in the
    visible part of this file -- confirm they are in scope.
    """
    # Kept from the original flow; pagetitles is reassigned before any use.
    pagetitles = mwReadPageTitles(config=config)

    # --- title list -------------------------------------------------------
    # Default to '' so a missing or empty titles file is handled as an
    # incomplete list instead of leaving lasttitle unbound.
    lasttitle = ''
    try:
        lasttitles = wikiteam.reverseReadline(
            '%s/%s-%s-titles.txt' % (
                config['path'],
                domain2prefix(config=config),
                config['date']))
        lasttitle = next(lasttitles)
        if lasttitle == '':
            # skip one empty line at the end of the file
            lasttitle = next(lasttitles)
    except Exception:
        # Best effort: probably the file does not exist. Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit through.
        pass

    if lasttitle == '--END--':
        # titles list is complete
        sys.stderr.write('Title list was completed in the previous session')
    else:
        sys.stderr.write('Title list is incomplete. Reloading...')
        # do not resume, reload, to avoid inconsistences, deleted pages or
        # so -- hence a full fetch with no start title
        pagetitles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)

    # --- XML dump ---------------------------------------------------------
    xmliscomplete = False
    lastxmltitle = None
    try:
        xmllines = wikiteam.reverseReadline(
            '%s/%s-%s-%s.xml' % (
                config['path'],
                domain2prefix(config=config),
                config['date'],
                'current' if config['curonly'] else 'history'))
        # Walk the file backwards and stop at the first decisive line.
        for line in xmllines:
            if line == '</mediawiki>':
                # xml dump is complete
                xmliscomplete = True
                break
            xmltitle = re.search(r'<title>([^<]+)</title>', line)
            if xmltitle:
                lastxmltitle = wikiteam.undoHTMLEntities(
                    text=xmltitle.group(1))
                break
    except Exception:
        pass  # best effort: probably the file does not exist

    if xmliscomplete:
        sys.stderr.write('XML dump was completed in the previous session')
    elif lastxmltitle:
        # resuming...
        sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle))
        pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
        mwGenerateXMLDump(
            config=config, pagetitles=pagetitles, start=lastxmltitle)
    else:
        # corrupt? only has XML header?
        sys.stderr.write('XML is corrupt? Regenerating...')
        pagetitles = mwReadPageTitles(config=config)
        mwGenerateXMLDump(config=config, pagetitles=pagetitles)


def _mwResumeImageDump(config):
    """Check the saved image list and images directory; reload or resume.

    Private helper of mwResumePreviousDump, used when config['images'] is
    truthy.

    NOTE(review): domain2prefix, other and truncateFilename are not defined
    in the visible part of this file -- confirm they are in scope.
    """
    # --- image list -------------------------------------------------------
    imagenames = []
    lastimage = ''
    try:
        # The with-block closes the file even when reading fails.
        with open('%s/%s-%s-images.txt' % (
                config['path'],
                domain2prefix(config=config),
                config['date']), 'r') as f:
            lines = f.read().strip().split('\n')
        # Only tab-separated lines are image records.
        imagenames = [line.split('\t') for line in lines if '\t' in line]
        lastimage = lines[-1]
    except Exception:
        pass  # best effort: probably the file does not exist

    if lastimage == '--END--':
        sys.stderr.write('Image list was completed in the previous session')
    else:
        sys.stderr.write('Image list is incomplete. Reloading...')
        # do not resume, reload, to avoid inconsistences, deleted images or
        # so
        imagenames = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=imagenames)

    # --- images directory -------------------------------------------------
    existing = set()
    try:
        # Decode only byte names; text names are used as they are.
        existing = set(
            n.decode('utf-8') if isinstance(n, bytes) else n
            for n in os.listdir('%s/images' % (config['path'])))
    except Exception:
        pass  # best effort: probably the directory does not exist

    complete = True
    lastfilename = ''
    lastfilename2 = ''
    c = 0
    for filename, _url, _uploader in imagenames:
        lastfilename2 = lastfilename
        # return always the complete filename, not the truncated
        lastfilename = filename
        filename2 = filename
        if len(filename2) > other['filenamelimit']:
            filename2 = truncateFilename(other=other, filename=filename2)
        if filename2 not in existing:
            complete = False
            break
        c += 1

    sys.stderr.write(
        '%d images were found in the directory from a previous session'
        % (c))
    if complete:
        # image dump is complete
        sys.stderr.write('Image dump was completed in the previous session')
    else:
        # we resume from previous image, which may be corrupted (or missing
        # .desc) by the previous session ctrl-c or abort
        mwGenerateImageDump(
            config=config, imagenames=imagenames, start=lastfilename2)


def mwResumePreviousDump(config=None):
    """Resume a dump started in a previous session.

    config is indexed with the keys 'xml', 'images' and 'logs' here and
    with 'path', 'date' and 'curonly' in the helpers:

    * 'xml' truthy: verify the title list and the XML dump, then reload or
      resume them.
    * 'images' truthy: verify the image list and the images directory,
      then reload or resume them.
    * 'logs' truthy: nothing is done yet.

    mwSaveIndexPHP, mwSaveSpecialVersion and mwSaveSiteInfo are always
    called at the end. A dict without 'xml', 'images' or 'logs' raises
    KeyError.

    NOTE(review): an earlier definition of mwResumePreviousDump precedes
    this one in the file; this later definition is the one that takes
    effect at import time.
    NOTE(review): mwCreateNewDump tests config['pages'] where this function
    tests config['xml'] -- confirm which key callers set.
    """
    # A None sentinel avoids sharing one dict object between calls.
    if config is None:
        config = {}

    sys.stderr.write('Resuming previous dump process...')

    if config['xml']:
        _mwResumePageDump(config)

    if config['images']:
        _mwResumeImageDump(config)

    if config['logs']:
        # fix: resuming logs is not implemented
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)