def getrecentchangeschunk(self):
    """Fetch one chunk of recent changes from the MediaWiki API.

    Queries ``list=recentchanges`` starting at the stored timestamp
    cursor, adds every changed title to ``self.allChangedTitlesSet``,
    and advances the cursor: to the server's continue timestamp when
    more chunks remain, or to "now" (and returns ``self.allfoundstr``)
    when this was the last chunk.

    Returns:
        ``self.allfoundstr`` when all changes have been fetched,
        otherwise ``''``.
    """
    query = {
        'action': 'query',
        'list': 'recentchanges',
        'rcprop': 'timestamp|title',
        'format': 'xml',
        'rcdir': 'newer',
        'rcstart': self.tmpAllTS['recentchanges'],
        'continue': self.dummycontinue,
        'rclimit': self.limit,
        'maxlag': self.maxlag,
    }
    url = self.siteurl + 'w/api.php?%s' % urllib.parse.urlencode(query)
    ok, alldata = toolsfornet.downloadHTML(url)
    root = ET.fromstring(alldata)

    # A <continue> element means the server has more results; its
    # rccontinue value ("timestamp|id") carries the next start time.
    continueTime = ''
    for node in root.iter('continue'):
        continueTime = node.attrib['rccontinue'].split('|')[0]

    for change in root.iter('rc'):
        self.allChangedTitlesSet.add(change.attrib['title'])

    if continueTime:
        # More chunks remain: resume from the server-supplied cursor.
        self.tmpAllTS['recentchanges'] = continueTime
        state = ''
    else:
        # Caught up: move the cursor to the current UTC time.
        self.tmpAllTS['recentchanges'] = time.strftime("%Y%m%d%H%M%S", time.gmtime())
        state = self.allfoundstr

    print('Titles changed after getrecentchangeschunk', len(self.allChangedTitlesSet))
    return state
def getlechangeschunk(self, whichle):
    """Fetch one chunk of log events of type *whichle* (e.g. 'delete').

    Queries ``list=logevents`` starting at the stored timestamp cursor
    for this log type, adds every affected title to
    ``self.allChangedTitlesSet``, and advances the cursor: to the
    server's continue timestamp when more chunks remain, or to "now"
    (and returns ``self.allfoundstr``) when this was the last chunk.

    Args:
        whichle: MediaWiki log type passed as ``letype`` (also the key
            into ``self.tmpAllTS``).

    Returns:
        ``self.allfoundstr`` when all events have been fetched,
        otherwise ``''``.
    """
    urldata = {
        'action': 'query',
        'list': 'logevents',
        'letype': whichle,
        'leprop': 'title|type|timestamp',
        'format': 'xml',
        'ledir': 'newer',
        'lestart': self.tmpAllTS[whichle],
        'continue': self.dummycontinue,
        'lelimit': self.limit,
        'maxlag': self.maxlag,
    }
    url = self.siteurl + 'w/api.php?%s' % urllib.parse.urlencode(urldata)
    ok, alldata = toolsfornet.downloadHTML(url)
    root = ET.fromstring(alldata)

    # A <continue> element means more results remain; lecontinue is
    # "timestamp|id" — keep only the timestamp part as the next cursor.
    continueTime = ''
    for b in root.iter('continue'):
        continueTime = b.attrib['lecontinue'].split('|')[0]

    state = ''
    # BUG FIX: list=logevents returns <item> elements (inside
    # <logevents>), not <rc> as list=recentchanges does. The old loop
    # over 'rc' never matched anything, so changed titles were silently
    # dropped while the timestamp cursor still advanced.
    for onerev in root.iter('item'):
        self.allChangedTitlesSet.add(onerev.attrib['title'])

    if continueTime == '':
        # Caught up: move the cursor to the current UTC time.
        self.tmpAllTS[whichle] = time.strftime("%Y%m%d%H%M%S", time.gmtime())
        state = self.allfoundstr
    else:
        self.tmpAllTS[whichle] = continueTime

    print('Titles changed getlechangeschunk getmoveschunk', len(self.allChangedTitlesSet), whichle)
    return state
def getmoveschunk(self):
    """Fetch one chunk of page-move log events.

    Queries ``list=logevents`` with ``letype=move``; for every move it
    records both the source title and the move target (from the
    <params> ``target_title`` attribute) in ``self.allChangedTitlesSet``,
    then advances the ``'move'`` timestamp cursor: to the server's
    continue timestamp when more chunks remain, or to "now" (and
    returns ``self.allfoundstr``) when this was the last chunk.

    Returns:
        ``self.allfoundstr`` when all moves have been fetched,
        otherwise ``''``.
    """
    urldata = {
        'action': 'query',
        'list': 'logevents',
        'letype': 'move',
        'leprop': 'title|type|timestamp|details',
        'format': 'xml',
        'ledir': 'newer',
        'lestart': self.tmpAllTS['move'],
        'continue': self.dummycontinue,
        'lelimit': self.limit,
        'maxlag': self.maxlag,
    }
    url = self.siteurl + 'w/api.php?%s' % urllib.parse.urlencode(urldata)
    ok, alldata = toolsfornet.downloadHTML(url)
    root = ET.fromstring(alldata)

    # A <continue> element means more results remain; lecontinue is
    # "timestamp|id" — keep only the timestamp part as the next cursor.
    continueTime = ''
    for b in root.iter('continue'):
        continueTime = b.attrib['lecontinue'].split('|')[0]

    state = ''
    # BUG FIX: the original wrapped this in an extra loop over
    # root.iter('logevents') while still scanning root.iter('item') —
    # re-processing every <item> once per container element. <item>
    # elements only occur inside <logevents>, so one direct pass over
    # them is equivalent and does the work exactly once.
    for item in root.iter('item'):
        self.allChangedTitlesSet.add(item.attrib['title'])
        for param in item.iter('params'):
            # The move destination lives on the nested <params> element.
            self.allChangedTitlesSet.add(param.attrib['target_title'])

    if continueTime == '':
        # Caught up: move the cursor to the current UTC time.
        self.tmpAllTS['move'] = time.strftime("%Y%m%d%H%M%S", time.gmtime())
        state = self.allfoundstr
    else:
        self.tmpAllTS['move'] = continueTime

    print('Titles changed after getmoveschunk', len(self.allChangedTitlesSet))
    return state
def downloadlatestdump(projectname, dumpsfolder):
    """Locate and download the latest dump for *projectname*.

    Walks the dumps site: fetches the backup index page, follows the
    project's link to its dump listing, extracts the link to the
    current dump file, and downloads it into *dumpsfolder* under the
    file's own basename. Returns silently (``None``) if any step fails
    or no dump link is found.
    """
    print('dumpsfolder', dumpsfolder)

    # Step 1: the global backup index.
    ok, alldata = toolsfornet.downloadHTML(
        urllib.parse.urljoin(urlofdumpslist, 'backup-index.html'))
    if not ok:
        return

    # Step 2: the project's own dump-listing page.
    href = getLinkToProjectDumpsFromHTML(alldata, projectname)
    ok, alldata = toolsfornet.downloadHTML(
        urllib.parse.urljoin(urlofdumpslist, href))
    if not ok:
        return

    # Step 3: the link to the current dump file itself.
    href = getLinkToProjectCurrentDumpFromHTML(alldata)
    if href == '':
        return

    # Step 4: download, naming the local file after the URL's basename.
    bz2name = href.split('/')[-1]
    toolsfornet.downloadADump(
        urllib.parse.urljoin(urlofdumpslist, href),
        os.path.join(dumpsfolder, bz2name))
    print('dump downloaded or not')
def getAChunkOfNewRevisions(self):
    """Download current revisions for the next chunk of changed titles.

    Takes the first ``MAX_PAGES_TO_GET_IN_A_CHUNK`` titles from
    ``self.allChangedTitles`` (remembered in ``self.workinglemmas``)
    and fetches their latest revisions via Special:Export, retrying a
    failed download every 2 seconds up to ``MAX_TRIES`` times.

    Returns:
        The exported XML text, or ``None`` when every retry failed.
    """
    self.workinglemmas = self.allChangedTitles[:MAX_PAGES_TO_GET_IN_A_CHUNK]
    query = {
        'title': 'Special:Export',
        # Special:Export takes one page title per line.
        'pages': '\n'.join(self.workinglemmas),
        'action': 'submit',
        'curonly': '1',
        'wpDownload': '1',
    }
    url = self.siteurl + 'w/index.php?%s' % urllib.parse.urlencode(query)
    print('downloading next chunk...')

    attempts = 0
    while True:
        ok, xmlfromnet = toolsfornet.downloadHTML(url)
        if ok:
            return xmlfromnet
        # Failed: back off briefly, give up after MAX_TRIES attempts.
        attempts += 1
        time.sleep(2)
        if attempts > MAX_TRIES:
            return None
def getProjectLatestDumpInfoFromNet(projectname): ok, alldata = toolsfornet.downloadHTML(urllib.parse.urljoin(urlofdumpslist, 'backup-index.html'))