/
populate_category.py
executable file
·180 lines (165 loc) · 6.79 KB
/
populate_category.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Bot to categorize images based on gallery information
*Put uncategorized files in a category
*Put categorized files in a subcategory
'''
import sys
sys.path.append("../pywikipedia")
import wikipedia, config, pagegenerators
import re, imagerecat
import MySQLdb, config
def populateCategory(cat = None):
    '''
    Populate a category with uncategorized images and images from parent categories

    The galleries to harvest are the gallery page that has the same name as
    the category (when it exists) and the gallery named by the hint that
    findGalleryHint() extracts from the category text. Every image returned
    by getImagesInGalleriesAndCategories() is handled as follows:
    * image.categories() is non-empty -> replaceCategory()
    * image.categories() is empty     -> addCategory()
    Afterwards removePopulateCategoryTemplate() is called with both counters.

    cat -- category page to work on; it has to provide title(),
           titleWithoutNamespace() and get(). The default None is not a
           usable value and fails on the first attribute access.
    '''
    uncatStats = 0
    recatStats = 0

    wikipedia.output(u'Working on ' + cat.title())
    catName = cat.titleWithoutNamespace()

    galleries = []
    # Find gallery with the same name
    if wikipedia.Page(wikipedia.getSite(), catName).exists():
        galleries.append(catName)
    # Find hint; listing the same gallery twice would only repeat a condition
    hint = findGalleryHint(cat.get())
    if hint and hint not in galleries:
        galleries.append(hint)

    # Get the current categories of the category page: these are the parents
    parents = imagerecat.getCurrentCats(cat)

    if galleries:
        # Hand the query builder its own copy, so the parent list given to
        # replaceCategory() below stays exactly what getCurrentCats()
        # returned, whatever the builder does with its argument.
        for image in getImagesInGalleriesAndCategories(galleries, list(parents)):
            if image.categories():
                # The image contains categories
                recatStats += replaceCategory(image, parents, catName)
            else:
                # No categories
                uncatStats += addCategory(image, catName)

    # Remove the template, leave stats.
    removePopulateCategoryTemplate(cat, uncatStats, recatStats)
def findGalleryHint (text = u''):
    '''
    Try to find a gallery hint in the Populate category template

    text -- wikitext that is searched for
            {{Populate category|gallery=<title>}}; the first letter of
            "populate" and of "gallery" may be upper or lower case.

    Returns the title without namespace of the hinted page when that page
    exists and is in namespace 0. Returns u'' in every other case.
    '''
    # Raw string: every backslash here is a regex escape, not a string escape
    p = re.compile(r'\{\{[pP]opulate category\|[gG]allery=(?P<gallery>[^}]+)\}\}')
    match = p.search(text)
    if not match:
        return u''

    result = u''
    gallery = wikipedia.Page(wikipedia.getSite(), match.group('gallery'))
    wikipedia.output(u'Found a match: '+ gallery.title())
    if gallery.exists():
        wikipedia.output(u'Exists')
        # Only pages in namespace 0 are accepted as a hint
        if gallery.namespace()==0:
            result = gallery.titleWithoutNamespace()
            wikipedia.output(result)
    return result
def getImagesInGalleriesAndCategories (galleries = None, categories = None):
    '''
    Get a generator of images to work on. The images are in one of galleries AND in one of the categories or uncategorized.
    Each image : (galleries AND (categories OR uncategorized))

    "Uncategorized" is expressed as a category link matching
    'Media needing categories as of%'.

    galleries  -- gallery titles; None counts as empty
    categories -- category titles, compared with SQL LIKE; None counts as
                  empty. The list that is passed in is not modified.

    Returns what pagegenerators.MySQLPageGenerator() returns for the query
    (at most 1000 rows), or None when no galleries are given.
    '''
    if not galleries:
        return None

    def quoteTitle(title):
        # Spaces become underscores before the comparison with page_title
        # and cl_to. The backslash has to be escaped before the quote,
        # otherwise a title ending in a backslash swallows the closing quote.
        # NOTE(review): LIKE gives backslash, % and _ a meaning of their own
        # on top of this; only the string literal is made well-formed here.
        escaped = title.replace(u' ', u'_').replace(u'\\', u'\\\\').replace(u"'", u"\\'")
        return u"'" + escaped + u"'"

    # Build a new list, so neither the caller's list nor a default is changed
    patterns = list(categories or []) + [u'Media needing categories as of%']

    galleryClause = u' OR '.join([u'gallery.page_title=' + quoteTitle(gallery) for gallery in galleries])
    categoryClause = u' OR '.join([u'cl_to LIKE ' + quoteTitle(category) for category in patterns])

    query = u''.join([
        u'SELECT DISTINCT imagepage.page_namespace, imagepage.page_title FROM page AS gallery ',
        u'JOIN imagelinks ON gallery.page_id = il_from ',
        u'JOIN page AS imagepage ON il_to=imagepage.page_title ',
        u'JOIN categorylinks ON imagepage.page_id=cl_from ',
        u'WHERE imagepage.page_namespace=6 AND imagepage.page_is_redirect=0 ',
        u'AND gallery.page_namespace=0 AND gallery.page_is_redirect=0 AND (',
        galleryClause,
        u') AND (',
        categoryClause,
        u') LIMIT 1000',
    ])
    return pagegenerators.MySQLPageGenerator(query)
def addCategory (image = None, category = u''):
    '''
    Replace the uncategorized template with a category

    image    -- page object providing get(), put() and title()
    category -- category name without namespace; u'' means "do nothing"

    Returns 1 after image.put() was called with the changed text, 0 when
    category is empty or the text contains no uncategorized template.
    '''
    if category == u'':
        return 0

    categoryLink = u'[[Category:' + category + u']]'
    oldtext = image.get()
    # The replacement is a callable so the link is inserted verbatim. As a
    # plain replacement string, a backslash in the category name would be
    # parsed as a template escape or group reference.
    newtext = re.sub(u'\{\{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', lambda match: categoryLink, oldtext)
    if oldtext == newtext:
        return 0

    wikipedia.output(image.title())
    wikipedia.showDiff(oldtext, newtext)
    comment = u'Adding ' + categoryLink + u' to this uncategorized image'
    image.put(newtext, comment)
    return 1
def getParentImages (parents = [], galleries = []):
    '''
    Get images which are in a parent category and in a gallery

    Placeholder without an implementation: both arguments are ignored and
    the function returns None.
    '''
    return None
def replaceCategory (image = None, parents = [], newcat = u''):
    '''
    Remove all parent categories and add newcat

    image   -- page object to recategorize
    parents -- category names that have to disappear from the image; the
               list is only read
    newcat  -- category name without namespace to add; u'' means "do nothing"

    Returns 1 after image.put() was called with the new text, 0 when newcat
    is empty or the filtered category set equals the current one.
    '''
    if newcat == u'':
        return 0

    currentCats = imagerecat.getCurrentCats(image)
    # Build a new list instead of appending to currentCats: the comparison
    # further down needs the categories exactly as they are on the page now.
    # The parents are added in case the category filter is lagging. The bot
    # often works on new categories, and in those cases the filter presumably
    # does not know the parent categories yet.
    workingCategories = list(currentCats) + [newcat] + list(parents)

    # Now remove those parents again
    newcats = [cat for cat in imagerecat.applyAllFilters(workingCategories) if cat not in parents]
    if set(currentCats) == set(newcats):
        return 0

    oldtext = image.get()
    newtext = wikipedia.removeCategoryLinks(oldtext, image.site()) + u'\n'
    newtext = newtext + u''.join([u'[[Category:' + category + u']]\n' for category in newcats])
    comment = u'Moving image to (a subcategory of) [[Category:' + newcat + u']] and trying to filter categories'
    wikipedia.output(image.title())
    wikipedia.showDiff(oldtext, newtext)
    image.put(newtext, comment)
    return 1
def removePopulateCategoryTemplate(page = None, uncatStats=0, recatStats=0):
    '''
    Strip {{populate category}} from the page text and save the page

    page       -- page object providing get() and put()
    uncatStats -- number reported in the edit summary as categorized images
    recatStats -- number reported in the edit summary as recategorized images

    Nothing is shown or saved when the text does not contain the template.
    '''
    before = page.get()
    after = re.sub(u'\{\{[Pp]opulate category\|?[^}]*\}\}', u'', before)
    if before == after:
        # No template found, so there is nothing to save
        return

    wikipedia.showDiff(before, after)
    summary = u'Removing {{Populate category}}, bot categorized ' + str(uncatStats)
    summary = summary + u' images and recategorized ' + str(recatStats) + u' images'
    wikipedia.output(summary)
    page.put(after, summary)
def main():
    '''
    Run the bot on the commons site.

    An argument starting with -page selects a single page: the bare
    argument asks for the title, anything longer takes the title from the
    seventh character on (as in -page:<title>). Without such an argument
    the pages come from ReferringPageGenerator() for
    Template:Populate category with onlyTemplateInclusion=True, filtered to
    namespace 14. populateCategory() is called for every page.
    '''
    wikipedia.setSite(wikipedia.getSite(u'commons', u'commons'))

    pages = None
    for arg in wikipedia.handleArgs():
        if not arg.startswith('-page'):
            continue
        site = wikipedia.getSite()
        if len(arg) == 5:
            title = wikipedia.input(u'What page do you want to use?')
        else:
            title = arg[6:]
        pages = [wikipedia.Page(site, title)]

    if not pages:
        template = wikipedia.Page(wikipedia.getSite(), u'Template:Populate category')
        referring = pagegenerators.ReferringPageGenerator(template, onlyTemplateInclusion=True)
        pages = pagegenerators.NamespaceFilterPageGenerator(referring, [14])

    for cat in pages:
        populateCategory(cat)
# Script entry point: only runs when the file is executed, not when imported
if __name__ == '__main__':
    try:
        main()
    finally:
        # In the finally clause, so stopme() is also called when main() raises
        wikipedia.stopme()