GE-ExportImport-Bot.py
(forked from dantman/pywikia)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This bot exports pages from Wikipedia, alters them, and then imports them into another wiki.
Info: http://en.anime.wikia.com/wiki/Project:Bots/ExportImport
"""
import sys, re
import urllib2
import wikipedia, pagegenerators, catlib, config
from time import time
import xml.dom.minidom as minidom
from xml.dom.minidom import Node
class GEExport:
    def __init__(self, pageGenerator):
        self.pageGenerator = pageGenerator

    def exportPage(self, page):
        # Fetch every revision of the page from Wikipedia via Special:Export,
        # saving each batch to an XML dump file. Assumes the wpdumps/
        # directory already exists.
        response = None
        data = None
        wp = wikipedia.getSite(code=u'en', fam=u'wikipedia')
        address = wp.export_address()
        title = page.sectionFreeTitle().encode(wp.encoding())
        predata = {
            'action': 'submit',
            'pages': title,
            'offset': '1',
        }
        # Loop until an export request returns no more revisions.
        while True:
            wikipedia.get_throttle()
            wikipedia.output('\03{lightpurple}>>\03{default} \03{lightaqua}Exporting revisions.\03{default}')
            # Now make the actual request to the server
            now = time()
            if wp.hostname() in config.authenticate.keys():
                predata["Content-type"] = "application/x-www-form-urlencoded"
                predata["User-agent"] = wikipedia.useragent
                data = wp.urlEncode(predata)
                response = urllib2.urlopen(urllib2.Request(wp.protocol() + '://' + wp.hostname() + address, data))
                data = response.read()
            else:
                response, data = wp.postForm(address, predata)
                data = data.encode(wp.encoding())
            wikipedia.get_throttle.setDelay(time() - now)
            doc = minidom.parseString(data)
            revs = doc.getElementsByTagName('revision')
            revCount = len(revs)
            if revCount > 0:
                # The timestamp of the last revision in this batch becomes
                # the offset for the next request.
                lastRev = revs[-1].getElementsByTagName('timestamp')[0]
                timestamp = ''
                for node in lastRev.childNodes:
                    if node.nodeType == Node.TEXT_NODE:
                        timestamp += node.data
                wikipedia.output('\03{lightpurple}>>\03{default} \03{lightaqua}Got %s revisions up to %s.\03{default}' % (revCount, timestamp))
                fileName = 'wpdumps/%s-%s.xml' % (title.replace('/', '-'), predata['offset'].replace(':', '-'))
                wikipedia.output('\03{lightpurple}>>\03{default} \03{lightblue}Saving to %s.\03{default}' % fileName)
                f = open(fileName, 'w')
                f.write(data)
                f.close()
                predata['offset'] = timestamp
            else:
                wikipedia.output('\03{lightpurple}>>\03{default} \03{lightaqua}Returned no revisions, exporting for this page is complete.\03{default}')
                break

    def run(self):
        wikipedia.output(u'\03{lightblue}Running Export bot.\03{default}')
        for page in self.pageGenerator:
            wikipedia.output('\03{lightpurple}>\03{default} \03{lightaqua}Doing \03{lightpurple}%s\03{default}' % page.aslink())
            self.exportPage(page)
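# For reference, the Special:Export response parsed above follows the
# MediaWiki XML export format, roughly like this (trimmed sketch; exact
# attributes and element order vary by MediaWiki version):
#
#   <mediawiki ...>
#     <page>
#       <title>...</title>
#       <revision>
#         <timestamp>2008-01-01T00:00:00Z</timestamp>
#         <text>...</text>
#       </revision>
#       ...
#     </page>
#   </mediawiki>
#
# exportPage() keeps requesting with offset set to the last <timestamp> until
# a response contains no <revision> elements.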
class GEImport:
    # Importing is not implemented yet; run() only announces itself.
    def run(self):
        wikipedia.output(u'\03{lightblue}Running Import bot.\03{default}')
def main():
    bot = None
    action = None
    # This factory is responsible for processing command-line arguments
    # that are also used by other scripts and that determine which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    gen = None
    for arg in wikipedia.handleArgs():
        if action is None:
            action = arg
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
    if action == 'export':
        if gen is None:
            wikipedia.output(u'\03{lightred}Export bot needs a page generator to iterate over.\03{default}')
            return
        bot = GEExport(gen)
    elif action == 'import':
        bot = GEImport()
    if bot is None:
        wikipedia.output(u'\03{lightred}Invalid bot action to run.\03{default}')
        return
    bot.run()
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()