def get_data(self, item):
    """Collect the current state of a wiki page.

    Returns a 4-tuple ``(content, content_to_db, edited, redirect)``:

    * ``content`` - page text as returned by ``item.get(get_redirect=True)``;
    * ``content_to_db`` - the same text passed through ``remove_utf8mb4``
      (presumably strips characters the DB charset cannot store - confirm
      against that helper);
    * ``edited`` - ``item.editTime()`` converted by ``convert_wiki_date``
      and then ``aware``;
    * ``redirect`` - result of ``item.isRedirectPage()``.

    If any of the page calls raises ``NoPage``, all four values are None.
    """
    try:
        text = item.get(get_redirect=True)
        fetched = (
            text,
            remove_utf8mb4(text),
            aware(convert_wiki_date(item.editTime())),
            item.isRedirectPage(),
        )
    except NoPage:
        # Callers test the third element (edit time) to detect this case.
        fetched = (None,) * 4
    return fetched
def load_templates(): import pywikibot site = pywikibot.Site('ru') category_names = [ u"Категория:Викисловарь:Шаблоны:Языки", u"Категория:Шаблоны:Языковые_заголовки", ] for category_name in category_names: category = pywikibot.Category(site, category_name) for article in category.articles(): title = article.title() m = re.match(u'^Шаблон:-([^-].*)-$', title) if m: lang = m.group(1) print lang content = article.get(get_redirect=True) edited = aware(convert_wiki_date(article.editTime())) LanguageTemplate.objects.get_or_create(lang_code=lang, edited=edited, content=content)
def load_templates(): import pywikibot site = pywikibot.Site('ru') category_names = [ u"Категория:Шаблоны:Названия_языков", # u"Категория:Шаблоны:Языковые_заголовки", ] for category_name in category_names: category = pywikibot.Category(site, category_name) for article in category.articles(): title = article.title() m = re.match(u'^Шаблон:(?P<case>[Ll])ang-(?P<lang>.+)$', title) # if not m: # print title if m: lang = m.group('lang') case = m.group('case') lower = case == 'l' print lang content = article.get(get_redirect=True) edited = aware(convert_wiki_date(article.editTime())) TemplateLangNames.objects.get_or_create( lang=lang, lower=lower, edited=edited, content=content)
continue title = orig_title[len(prefix) :] print title if title == u"сущ": continue if not re.match(u"^сущ/[fmn]\d/\d+$", title): continue # print 'ok' # continue old = WordInflectionMassEdit.objects.get(title=title) old_content = old.content new_content = remove_utf8mb4(item.get(get_redirect=True)) edited = aware(convert_wiki_date(item.editTime())) print edited update_content = new_content if new_content != old_content: if u"{{пишу}}" in new_content or u"{{message box" in new_content: print u"×" * 20, u"{{пишу}} detected" first_edited = None break changing_started = False old_items, old_reports = parse_mass_edit(old_content) new_items, new_reports = parse_mass_edit(new_content) if len(old_items) != len(new_items) or set(old_items.keys()) != set(new_items.keys()): print u"×" * 20, "WRONG DIFFERENCE IN TITLES (COUNTS OR VALUES)" continue for word in sorted(old_items.keys(), key=lambda x: x[::-1]):
def process_item(self, item, i): # todo: create external mechanism of pausing work (actual for big processors) try: title = item.title() except InvalidTitle: print 'Wrong title', '#' * 120 return if ':' in title: # todo: we need this only for RecentProcessor if title.startswith(u"Шаблон:"): # if title.startswith(u"Категория:") or title.startswith(u"Шаблон:"): # print '-' * 40 pass else: return if self.readonly: return self.process_item_readonly(item, title, i) # if Page.objects.filter(title=title): # print dt(), title, '- exists' # return page = self.get_page(title, i) if not page: return content, content_to_db, edited, redirect = self.get_data(item) if not edited: # print dt(), title.encode('cp1251'), '-', 'DELETED', '#' * 20 print dt(), '& PAGE WAS DELETED - %d - & %s // pk=%d' \ % (i, transliterate(title), page.pk) # print dt(), transliterate(title), '-', 'DELETED', '#' * 10 # print dt(), '-', 'DELETED', '#' * 20 page.delete_and_log() return # print dt(), edited, '-', i, '-', title.encode('cp1251'), '| pk =', page.pk if not self.output_interval or not i % self.output_interval: print dt(), if ':' in title: print ':', else: print '#', print edited, '-', i, '-', if ':' in title: print ':', else: print '#', print transliterate(title), ' // pk=%s' % page.pk # print dt(), edited, '-', i, '-', '| pk =', page.pk log = transliterate(title) oldest = next(item.revisions(reverseOrder=True, total=1, content=True)) created_at = aware(convert_wiki_date(oldest.timestamp)) created_author = oldest.user created_lang = '?' if oldest.text is None: created_lang = '??' 
else: # print oldest.text # print repr(oldest.text) # print m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}', oldest.text, flags=re.MULTILINE | re.UNICODE) m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text) m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text, flags=re.UNICODE | re.IGNORECASE) if m: created_lang = m.group(1) elif re.search(u'^= *Праславянский', oldest.text, flags=re.MULTILINE | re.UNICODE): created_lang = u'Праславянский' elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = u'eo' elif m2: created_lang = m2.group(1) elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'en' elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'de' elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'en' elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'fr' elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'it' elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? 
*== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'nl' elif re.search(u'\{\{(английский|en)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'en' elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'nl' elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'de' elif re.search(u'\{\{(it)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE): created_lang = 'it' elif m_new: created_lang = m_new.group(1) elif re.search(u'#(redirect|перенаправление)', oldest.text, flags=re.MULTILINE | re.UNICODE | re.IGNORECASE): created_lang = u'-' else: save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk, oldest.text.encode('utf-8')) # print # print transliterate(title), created_at # print transliterate(oldest.user), transliterate(created_lang) # print self.update_data(page, content, content_to_db, edited, redirect, log, created_at, created_author, created_lang) if ':' in title: return return edited