def save_step(self):
    if os.path.exists('stop'):
        print u'Stopping command because "stop" file exists.'
        self.stop()
    if not self.last_action_filename:
        raise NotImplementedError()
    if os.path.exists(self.state_filename):
        content = load_file(self.state_filename)
        last_action = datetime.strptime(content, self.date_format)
        delta = datetime.now() - last_action
        # Skip the save if the last recorded action is fresh enough.
        if delta < timedelta(seconds=5):
            return
    try:
        save_file(self.state_filename, self.current_datetime)
        append_file(u'files/logs/%s-s' % self.last_action_filename,
                    u'%s - %s' % (self.current_datetime, self.i))
    except IOError:
        print u'### IOError', '#' * 200
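# These snippets lean on three small file helpers (load_file, save_file,
# append_file) imported from dictionaries.utils.file (see the import near
# the end of this section); their implementation is not shown anywhere
# here. A minimal sketch consistent with the call sites, offered as an
# assumption rather than the project's actual code:
def load_file(filename, decode=None):
    # Read a whole file; decode to unicode when an encoding is given.
    data = open(filename, 'rb').read()
    return data.decode(decode) if decode else data


def save_file(filename, content, encode=None):
    # Overwrite a file; call sites pass the encoding either positionally
    # or as encode=..., both of which map onto this keyword.
    if encode:
        content = content.encode(encode)
    open(filename, 'wb').write(content)


def append_file(filename, line, encode='utf-8'):
    # Append a single log line, creating the file if needed.
    open(filename, 'ab').write(line.encode(encode) + '\n')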
def save_db_to_file():
    i = 0
    print dt(), i
    # Earlier variants iterated over Word_Description instead, e.g.:
    # for item in Word_Description.objects.iterate(100000, ['word', 'word__details']):
    # and some filtered out dictionaries 12, 13 and 16.
    output = list()
    for item in Word_Value.objects.iterate(100000):
        i += 1
        output.append("%s|%s" % (item.value, item.dictionary_id))
        if not i % 10000:
            print dt(), i
    save_file('db_data.txt', '\n'.join(output), 'utf-8')
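# Word_Value.objects.iterate(...) is a custom manager method, not part of
# Django. A plausible sketch, assuming it pages through the table by
# primary key in fixed-size chunks so the whole table is never held in
# memory at once (chunk size and the prefetch argument are inferred from
# the call sites in this section; the real implementation is not shown):
def iterate(self, chunk_size=100000, prefetch=None, start_pk=0):
    pk = start_pk
    while True:
        qs = self.filter(pk__gt=pk).order_by('pk')
        if prefetch:
            qs = qs.prefetch_related(*prefetch)
        rows = list(qs[:chunk_size])
        if not rows:
            break
        for row in rows:
            yield row
        pk = rows[-1].pk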
def after(self):
    result = '<div class="index">\n'
    for key in sorted(self.groups.keys()):
        result += '\n== %s ==\n' % key
        # Sorting on the reversed string groups words by their endings.
        for value in sorted(self.groups[key], key=lambda x: x[::-1]):
            result += '* [[%s]]\n' % value
    result += '\n</div>\n'
    save_file(join(settings.DATA_PATH, 'сущ ru words.txt'), result)
def content_action(self, page, content, **kwargs):
    if self.i < 1050000:  # or self.i > 950000:
        return
    # Shard saved pages into folders of 10000 by page id.
    folder = "%sx" % (page.id / 10000)
    path = os.path.join(settings.FILES_PATH, 'pages', folder)
    if not os.path.exists(path):
        os.mkdir(path)
    filename = os.path.join(path, "%s" % page.id)
    save_file(filename, content, encode='utf-8')
    if not self.i % 100:
        time.sleep(2)
def content_action(self, page, content, **kwargs):
    # Earlier variants reported pages with a missing or duplicated {{-ru-}}
    # header, or dumped every page whose content mentioned {{semantcat}}.
    if u'semantcat' in page.title:
        save_file(page.title.replace(':', '_').replace('/', ' - ') + ".tpl",
                  content.encode('utf-8'))
def after(self):
    WordInflection.objects.bulk_add()
    result = u'<div class="index">\n'
    for key in sorted(self.groups.keys()):
        # Sorting on the reversed string groups words by their endings.
        values = sorted(self.groups[key], key=lambda x: x[::-1])
        result += u'\n== <nowiki>%s</nowiki> (%d) ==\n' % (key, len(values))
        for value in values:
            result += u'* [[%s]]\n' % value
    result += u'\n</div>\n'
    save_file(join(settings.DATA_PATH, u'сущ ru words.txt'), result,
              encode='utf-8')
def analyze_words(words):
    result = list()
    for word, data in words.items():
        dicts_list = list()
        for key, dicts in data.items():
            for d in dicts:
                dicts_list.append(d.strip())
        dicts_text = ','.join(dicts_list)
        if len(data) == 0:
            print '[0]', word
        # Earlier checks also reported single-dictionary case variants
        # ([1U]/[1C]) and words found in 3+ dictionaries ([3]/[*]).
        if len(data) > 1:
            if word.capitalize() in data:
                if word.lower() in data or word.upper() in data:
                    if set(data[word.capitalize()]) == {'1'}:
                        print
                        print '[CC]', word.capitalize(), '-', dicts_text
                        for key, dicts in data.items():
                            print ' -', key, '-', ','.join(dicts)
                        print 'ok'
        # [CC] Дэу - 1,1
        #  - Дэу - 1
        #  - дэу - 1
        # Tricky case (both forms are present!):
        # [CC] Дуся - 1,2,4,3
        #  - Дуся - 1
        #  - дуся - 2,4,3
        # If the lowercase form appears only in dict 6, look for the
        # "[capitalized]" mark; if dict 1 has uppercase and dict 2 has
        # lowercase, it is lowercase; only dicts 1 and 6: "Хэллоуин",
        # "Мнемосина"; only dict 8 may be unreliable: "израилев".
        result.append("%s|%s" % (word, dicts_text))
    save_file('total_list_of_words.txt', '\n'.join(result))
def create_file():
    words = []
    category = u'Русский_язык'
    path = join(categories_path, category)
    files = os.listdir(path)
    for filename in files:
        lines = open(join(path, filename)).read().split('\n')
        for line in lines:
            words.append(line)
    # The second argument is required to actually set the locale from the
    # environment; without it, setlocale() only queries the current value.
    locale.setlocale(locale.LC_ALL, '')
    words = [word.decode('utf-8') for word in words]
    words = sorted(words, cmp=locale.strcoll)
    save_file(join(path, '..', 'wiktionary-russian-2013-08-02-text.txt'),
              "\n".join(words), 'utf-8')
    words2 = ["# [[%s]]" % word for word in words]
    save_file(join(path, '..', 'wiktionary-russian-2013-08-02-wiki.txt'),
              "\n".join(words2), 'utf-8')
def analyze_category(title):
    queue = [title]
    while queue:
        title = queue.pop()
        print u"→", title
        if re.search(r'[\\"?*|<>]', title):
            print " ×", title, "—", "bad symbols in title"
            continue
        if title in processed_titles:
            print " ×", title, "—", "already used"
            continue
        # Categories named like ".../en" or "... по языкам" are per-language
        # buckets; they are filed under a separate "#" subtree below.
        lang_skipping = re.match(u".*(/[-a-z]{2,8}|по языкам)$", title)
        processed_titles.append(title)
        file_title = title.replace("/", "#").replace(":", "%")
        if lang_skipping:
            dirname = lang_skipping.group(1).replace("/", "")
            skip_path = join(categories_path, "#", dirname)
            if not exists(skip_path):
                os.mkdir(skip_path)
            filename = join(skip_path, file_title)
        else:
            filename = join(categories_path, file_title)
        # Earlier versions also skipped titles already listed under the
        # 'complete' and 'blocked' directories.
        if exists(filename):
            base_titles, sub_titles = read_titles(filename)
        else:
            category = get_category("%s%s" % (category_prefix, title))
            base_titles = process_categories(category.categories())
            sub_titles = process_categories(category.subcategories())
            base_content = "\n".join(map(lambda x: "< %s" % x, base_titles)) or "-"
            sub_content = "\n".join(map(lambda x: "> %s" % x, sub_titles)) or "-"
            content = "%s\n\n%s\n" % (base_content, sub_content)
            save_file(filename, content, encode="utf-8")
            print u" +", title, "—", "saved"
        for sub_title in base_titles + sub_titles:
            if sub_title not in processed_titles:
                queue.append(sub_title)
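# read_titles() is not shown in these snippets. Given that analyze_category
# writes parent categories as "< title" lines and subcategories as
# "> title" lines, with "-" standing in for an empty list, a matching
# reader would look roughly like this (an assumption inferred from the
# save format above, not the project's actual code):
def read_titles(filename):
    base_titles, sub_titles = [], []
    for line in load_file(filename, decode='utf-8').split('\n'):
        if line.startswith('< '):
            base_titles.append(line[2:])
        elif line.startswith('> '):
            sub_titles.append(line[2:])
    # Bare "-" lines (empty lists) and blank separators fall through.
    return base_titles, sub_titles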
def download_pages():
    category = u'Русский_язык'
    print category
    print '=' * 10
    path = join(categories_path, category)
    if not exists(path):
        os.mkdir(path)
    p = 0
    url = get_category_page_url(category)
    # There is no explicit stop condition here; presumably
    # extract_pages_and_next_link() fails on the last page.
    while True:
        print dt(), 'Page', p
        content = download(url)
        url, content = extract_pages_and_next_link(content)
        url = 'http://ru.wiktionary.org%s' % url
        filename = join(path, 'p%d' % p)
        save_file(filename, content, 'utf-8')
        p += 1
        time.sleep(0.1)
def handle(self, *args, **options):
    if not self.name:
        raise NotImplementedError()
    append_file('files/timing',
                u'%s @ %s' % (self.current_datetime, self.name))
    print '@', self.current_datetime, '-', "Starting wikt command:", self.name
    # The state file doubles as a lock: if it exists, a previous run is
    # still active (or crashed without cleaning up).
    is_started = os.path.exists(self.state_filename)
    if is_started:
        print '# Command already started'
        if not self.skip_mail:
            send_wiki_mail(
                subject="Wikt command already started: " + self.name,
                message=open(self.state_filename).read()
            )
        return
    save_file(self.state_filename, "Started at " + self.current_datetime)
    self.wikt_handle(*args, **options)
    os.remove(self.state_filename)
    append_file('files/timing',
                u'%s . %s' % (self.current_datetime, self.name))
def update(self, page, value, log=False, edited=None, is_content=False,
           content_with_utf8mb4=None):
    if log and not is_content or not log and is_content:
        raise Exception('log and is_content should be both set or unset')
    if not self.field_name:
        return
    obj, created = self.get_or_create(page=page)
    changed = getattr(obj, self.field_name) != value
    if is_content:
        # An earlier version stored one file per page under
        # pages/<id / 10000>x/<id>. Contents now live 1000 to a block file,
        # joined with G.separator: page.id / 1000 picks the file,
        # page.id % 1000 picks the slot inside it.
        block_num = page.id / 1000
        filename = "%sx" % block_num
        path = os.path.join(settings.CONTENT_FILES_PATH, filename)
        if os.path.exists(path):
            contents = load_file(path, decode='utf-8').split(G.separator)
        else:
            contents = [''] * 1000
        old_content = contents[page.id % 1000]
        new_content = content_with_utf8mb4
        page.cached_content = new_content
        if old_content != new_content:
            contents[page.id % 1000] = new_content
            save_file(path, G.separator.join(contents), encode='utf-8')
            print ' $ pk=%d: content updated (file)' % page.id
            print >> sys.stderr, '... Updated at %s: %s // pk=%d' % \
                (edited, log, obj.pk)
    if changed:
        self.updating(obj, value)
        if log:
            print >> sys.stderr, '::: Updated at %s: %s // pk=%d' % \
                (edited, log, obj.pk)
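# The same block layout (1000 page contents per file, joined by
# G.separator) appears again in the dump script at the end of this
# section. A small read-side helper makes the addressing explicit; the
# function name is hypothetical, for illustration only:
def load_page_content(page_id):
    # page_id / 1000 picks the "<block>x" file, page_id % 1000 the slot.
    path = os.path.join(settings.CONTENT_FILES_PATH, "%sx" % (page_id / 1000))
    if not os.path.exists(path):
        return ''
    contents = load_file(path, decode='utf-8').split(G.separator)
    return contents[page_id % 1000]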
def parse_htmls():
    path = YANDEX_SLOVARI_UK_RU_PATH
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        print filename
        content = load_file(join(path, filename), decode='utf-8')
        for block in ['left', 'center', 'right']:
            text = re.search('<td class="l-page__%s">(.*?)</td>' % block,
                             content, flags=re.DOTALL).group(1)
            text = re.sub('<div id="b-direct-(bottom|right)"[^>]*></div>', '',
                          text, flags=re.DOTALL)
            text = re.sub('<div class="b-translation__controls">.*?</div>', '',
                          text, flags=re.DOTALL)
            text = re.sub('<i class="b-statcounter">.*</i>', '',
                          text, flags=re.DOTALL)
            new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, block,
                                filename)
            save_file(new_filename, text.strip(), encode='utf-8')
def display_all_codes(letter):
    prefetch = ['en_wiki', 'equals', 'wikt_en', 'wikt_en__en_wiki',
                'wikt_en__en_wiki__redirect_to']
    items = LangCode.objects.filter(code__startswith=letter) \
        .prefetch_related(*prefetch).order_by('code')
    for lang in items:
        lang.en_items = []
        for item in LangWikiEnIso1.objects.filter(code=lang):
            lang.en_items.append(item)
        for item in LangWikiEnIso3.objects.filter(code=lang):
            lang.en_items.append(item)
        lang.ru_items = []
        for item in LangWikiRuIso1.objects.filter(code=lang):
            lang.ru_items.append(item)
        for item in LangWikiRuIso3.objects.filter(code=lang):
            lang.ru_items.append(item)
    content = render_to_string("wiki/iso_langs.html",
                               {'items': items, 'letter': letter})
    filename = os.path.join(settings.ROOT_PATH, 'wikt', 'data',
                            'langs_%s.txt' % letter)
    save_file(filename, content.replace('\r', ''), encode='utf-8')
    return content
def download(slug, imin, imax, step=1, subdomain=False, missed=False):
    if subdomain:  # actually not needed
        url = 'http://%s.academic.ru/%%s/' % slug
    else:
        url = 'http://dic.academic.ru/dic.nsf/%s/%%s/' % slug
    if not missed:
        path_download = join(BASE_PATH, slug)
        path_out = None
        path_empty = None
    else:  # the id range has holes
        if subdomain:
            path_out = 'c:/download/%s.academic.ru.out' % slug
            path_download = 'c:/download/%s.academic.ru.download' % slug
            path_empty = ''
        else:
            path_out = join(BASE_PATH, '%s.words.out' % slug)
            path_download = join(BASE_PATH, '%s.words.download' % slug)
            path_empty = join(BASE_PATH, '%s.words.download.empty' % slug)
    if not exists(path_download):
        os.mkdir(path_download)
    for i in range(imin, imax, step):
        name = str(i)
        dt = datetime.now().strftime("[%H:%M:%S]")
        sslug = "(%s)" % slug
        filename = join(path_download, name)
        if exists(filename):
            print dt, name, sslug, '+++'  # already downloaded
        elif path_out and exists(join(path_out, name)):
            print dt, name, sslug, '+'  # already processed
        elif path_empty and exists(join(path_empty, name)):
            print dt, name, sslug, '---'  # known empty page
        else:
            print dt, name, sslug, 'downloading'
            content = urllib.urlopen(url % name).read().replace('\r', '')
            save_file(filename, content)
            time.sleep(0.1)
def after(self):
    super(SlogiReplacer, self).after()
    self.content += "|-\n|}"  # close the wiki table
    save_file(join(settings.DATA_PATH, 'slogi.txt'), self.content,
              encode='utf-8')
def parse_parsed_center():
    path = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center')
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center.parsed',
                            filename)
        content = load_file(join(path, filename), decode='utf-8')

        # "Did you mean" suggestion: record the word as unresolved ("?").
        p = re.compile('<div class="b-did-you-mean b-did-you-mean_margin_yes">(.+?)</div>', flags=re.DOTALL)
        m = p.search(content)
        if m:
            value = m.group(1)
            content = p.sub('', content)
            m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали: «<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»\?</span>', value)
            if not m:
                m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали одно из этих слов: («<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»(, )?)+\?</span>', value)
                if not m:
                    print 'x' * 100
                    print value
            save_file(new_filename, '?', encode='utf-8')
            continue  # wrong

        # Typo-correction notice: verify it matches the queried word.
        p = re.compile(u'<div class="b-misspell">В запросе «<strong>(.*?)</strong>» исправлена опечатка</div>')
        m = p.search(content)
        if m:
            misspell = m.group(1)
            misspell = re.sub('</?em>', '', misspell)
            if misspell != filename:
                print 'x' * 100
                print misspell
                print filename
            content = p.sub('', content).strip()

        # Machine-translated result (the "Яндекс.Перевод" full-text fallback).
        p = re.compile(u'<h6 class="b-translation__fulltext-link">Переведено с украинского на русский<img src="[^"]+"> по технологии «<a target="_blank" href="[^"]+">Яндекс.Перевод</a>».</h6><p class="b-translation__disclaimer">Пожалуйста, пользуйтесь результатом машинного перевода с осторожностью, возможны ошибки.</p>')
        m = p.search(content)
        if m:
            content = p.sub('', content)
            m = re.match('^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__fulltext"><p>([^>]+)</p></div>\s*</div>$', content)
            if not m:
                print filename
                print repr(filename)
                print content
                print
            translate = m.group(1)
            save_file(new_filename, 'translated: %s' % translate,
                      encode='utf-8')
            continue  # ok

        # The word was found only in usage examples.
        p = re.compile(u'^<div class="b-translation b-translation_type_example i-bem" onclick="[^"]+">\s*<div class="b-translation__group"><div class="b-translation__group-line"><span class="b-translation__group-title">Найдено в примерах</span></div>\</div>(.*)\s*</div>$', flags=re.DOTALL)
        m = p.match(content)
        if m:
            content = m.group(1).strip()
            items = re.findall(u'<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">(.*?)</div>', content)
            for title, example, source in items:
                title = remove_span(title)
                example = remove_span(example)
                source = remove_span(source)
                if not re.match(u'^(Русско-украинский|Украинско-русский) словарь\. ИТФ «Перун» › .*', source):
                    print 'x' * 100
                    print source
            continue  # ok

        # Regular translation card.
        p = re.compile(u'^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__card b-translation__card_examples_three"><h1 class="b-translation__title"><span class="b-translation__text">(.*?)</span>(?: <img class="b-icon b-icon_type_audio-big" src="[^"]+" alt="Прослушать" title="Прослушать" onclick="[^"]+">)?</h1>(.*)</div>\s*</div>$')
        m = p.match(content)
        if m:
            title = m.group(1)
            content = m.group(2).strip()
            items = re.split('<div class="b-translation__group[^"]*">', content)
            for item in items:
                if not item:
                    continue
                p = re.compile(u'<div class="b-translation__grammar-note">(с|ж)</div>')
                m = p.search(item)
                if m:
                    item = p.sub('', item).strip()
                p = re.compile('<h2 class="b-translation__group-title" id="([^"]+)">(.*?)</h2>')
                m = p.search(item)
                if m:
                    morpho_id = m.group(1)
                    morpho_title = m.group(2)
                    item = p.sub('', item).strip()
                m = re.match('<ol class="b-translation__entries[^"]*">(.*)</ol>(.*)</div>', item)
                if not m:
                    print 'x' * 100
                    print filename
                    print item
                    print
                lines = m.group(1).strip()
                tail = m.group(2).strip()
                if tail and not re.match('<div class="b-translation__rejex">(.*)</div>', tail):
                    print 'x' * 100
                    print tail
                lines = re.findall('<li class="b-translation__entry">(.*?)</li>', lines)
                for line in lines:
                    m = re.match('^<div class="b-translation__translation"><span class="b-translation__translation-words">(.*)</span></div>(?:<div class="b-translation__examples">(.*)</div>)?$', line)
                    if not m:
                        print 'x' * 100
                        print line
                        continue
                    line = m.group(1)
                    examples = m.group(2)
                    if examples:
                        pass  # examples are present here but not used yet
                    line = remove_span(line)
                    p = re.compile('<a class="b-translation__link ajax ajax-full" href="[^"]+">([^<]+)</a>')
                    line = p.sub(r'[\1]', line)
                    line = line.replace(u'¸', u'ё')
                    line = line.replace(u'<sup class="b-translation__src-num"></sup>', '')
            continue  # ok

        # Nothing found.
        if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__message">К сожалению, мы ничего не нашли\.</div>Попробуйте поискать <a class="b-link" href="[^"]+">в энциклопедиях</a>\.</div></div>$', content):
            save_file(new_filename, '', encode='utf-8')
            continue  # not found

        # Server-side failure page: move the raw download aside for re-fetch.
        if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__failure">Похоже, что-то пошло не так. Пожалуйста, попробуйте обновить страницу.</div></div></div>$', content):
            old_filename = join(YANDEX_SLOVARI_UK_RU_PATH, filename)
            if exists(old_filename):
                os.rename(old_filename,
                          join(YANDEX_SLOVARI_PATH, 'uk-ru-bad', filename))
                print u"File moved:", filename
            continue  # bad

        # Anything else is unexpected: dump it for inspection.
        print content
        print
def create_file():
    words = []
    files = os.listdir(path)
    for filename in files:
        lines = open(join(path, filename)).read().split("\n")
        for line in lines:
            line = line.strip().decode("utf-8")
            if not line or line in [
                u"{{Русский индекс}}",
                u"__TOC__",
                u"[[ъ]]",
                u"[[ь]]",
                u":''В русском языке нет слов, начинающихся c твёрдого знака.''",
                u":''В русском языке нет слов, начинающихся с мягкого знака.''",
                u"[[uk:НА]]",
                u"[[uk:НЕ]]",
                u"[[uk:ПО]]",
            ]:
                continue
            skip_patterns = [
                u"^см. \[\[(/[а-яё])\|([а-яё]{2})\]\]$",
                u"^== ?([-а-яё/]+) ?==$",
                u"^\[\[[a-z]{2}:index [а-яё]+\]\]$",
                u"^\[\[en:Index:Russian/[а-яё]\]\]$",
            ]
            skip = False
            for pattern in skip_patterns:
                if match(pattern, line):
                    skip = True
                    break
            if skip:
                continue
            m = re.match(u"^\* ?\[\[([^]]+)\]\]!?( ?\{\{И1\}\})?$", line)
            if m:
                word = m.group(1)
                lopatin = m.group(2)
                words.append(word)
            else:
                m = match(u"^\* ?\[\[([^]]+)\]\]!?([^{}]*)( ?\{\{И1\}\})?$", line)
                if m:
                    word = m.group(1)
                    lopatin = m.group(3)
                    words.append(word)
                else:
                    print "#" * 100
                    print filename.decode("cp1251")
                    print line
    locale.setlocale(locale.LC_ALL, "")  # todo: try ru.UTF-8
    words = sorted(words, cmp=locale.strcoll)
    save_file(join(path, "..", "wiktionary-index-2013-08-03-text.txt"),
              "\n".join(words), "utf-8")
    words2 = ["# [[%s]]" % word for word in words]
    save_file(join(path, "..", "wiktionary-index-2013-08-03-wiki.txt"),
              "\n".join(words2), "utf-8")
    words3 = ["# [[:%s]]" % word for word in words]
    save_file(join(path, "..", "wiktionary-index-2013-08-03-wiki2.txt"),
              "\n".join(words3), "utf-8")
def after(self):
    print dt(), 'finished!', datetime.now() - self.s
    if settings.USE_FINISHED_FILENAME:
        save_file(self.finished_filename, unicode(datetime.now()))
def get_and_save_wiki(title, filename):
    url = get_edit_page_url(title)
    content = download(url)
    wiki = extract_wiki_text(content)
    save_file(filename, wiki, 'utf8')
    return wiki
def process_item(self, item, i):
    # todo: create an external mechanism for pausing work
    # (relevant for big processors)
    try:
        title = item.title()
    except InvalidTitle:
        print 'Wrong title', '#' * 120
        return
    if ':' in title:
        # todo: we need this only for RecentProcessor
        if title.startswith(u"Шаблон:"):
            pass
        else:
            return
    if self.readonly:
        return self.process_item_readonly(item, title, i)
    page = self.get_page(title, i)
    if not page:
        return
    content, content_to_db, edited, redirect = self.get_data(item)
    if not edited:
        print dt(), '& PAGE WAS DELETED - %d - & %s // pk=%d' \
            % (i, transliterate(title), page.pk)
        page.delete_and_log()
        return
    if not self.output_interval or not i % self.output_interval:
        # Prefix namespaced titles with ':' and main-space ones with '#'.
        print dt(),
        if ':' in title:
            print ':',
        else:
            print '#',
        print edited, '-', i, '-',
        if ':' in title:
            print ':',
        else:
            print '#',
        print transliterate(title), ' // pk=%s' % page.pk
    log = transliterate(title)
    # Detect the language of the page's very first revision.
    oldest = next(item.revisions(reverseOrder=True, total=1, content=True))
    created_at = aware(convert_wiki_date(oldest.timestamp))
    created_author = oldest.user
    created_lang = '?'
    if oldest.text is None:
        created_lang = '??'
    else:
        m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}',
                      oldest.text, flags=re.MULTILINE | re.UNICODE)
        m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text)
        m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text,
                          flags=re.UNICODE | re.IGNORECASE)
        if m:
            created_lang = m.group(1)
        elif re.search(u'^= *Праславянский', oldest.text,
                       flags=re.MULTILINE | re.UNICODE):
            created_lang = u'Праславянский'
        elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = u'eo'
        elif m2:
            created_lang = m2.group(1)
        elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'de'
        elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'fr'
        elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'it'
        elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? *== *\n',
                       oldest.text, flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'nl'
        elif re.search(u'\{\{(английский|en)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'en'
        elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'nl'
        elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'de'
        elif re.search(u'\{\{(it)\}\}', oldest.text,
                       flags=re.UNICODE | re.IGNORECASE):
            created_lang = 'it'
        elif m_new:
            created_lang = m_new.group(1)
        elif re.search(u'#(redirect|перенаправление)', oldest.text,
                       flags=re.MULTILINE | re.UNICODE | re.IGNORECASE):
            created_lang = u'-'
        else:
            # Unrecognized first revision: dump it for later inspection.
            save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk,
                      oldest.text.encode('utf-8'))
    self.update_data(page, content, content_to_db, edited, redirect, log,
                     created_at, created_author, created_lang)
    if ':' in title:
        return
    return edited
def after(self):
    super(A, self).after()
    save_file(settings.FILES_PATH + '/data/not_lopatin_transcriptions.txt',
              self.result, encode=u'utf-8')
# coding: utf-8
from dictionaries.utils.file import save_file
from wikt.commons.utils.wikibot import get_wiki_page_content

# A test string with characters outside the BMP (Egyptian hieroglyphs),
# i.e. the kind that needs utf8mb4 in MySQL.
s = u'|egy=[[𓊪𓏏𓇯]]'
with open('test_utf.txt', mode='w') as f:
    f.write(s.encode('utf-8'))

content = get_wiki_page_content(u'небо')
save_file(u'небо-t.txt', content, encode=u'utf-8')
import os
import time
import urllib
from os.path import join

from wikt.models import Page
from wikt.tasks.parse.yandex_slovari.common import YANDEX_SLOVARI_UK_RU_PATH
# dt() and save_file() come from the project's utility modules (not shown).


class AppURLopener(urllib.FancyURLopener):
    # Spoof a desktop browser user agent (urllib's default is often rejected).
    version = "Opera/9.80 (Windows NT 6.2; WOW64) Presto/2.12.388 Version/12.16"

urllib._urlopener = AppURLopener()

words = Page.objects.filter(page_content__content__contains="= {{-uk-"). \
    values_list('title', flat=True)
print len(words)
for word in words:
    if word[0] == word[0].upper():
        continue
    filename = join(YANDEX_SLOVARI_UK_RU_PATH, word)
    if os.path.exists(filename):
        continue
    print dt(), word
    word_url = urllib.quote_plus(word.encode('utf-8'))
    url = u"http://slovari.yandex.ru/%s/uk-ru" % word_url
    content = urllib.urlopen(url).read()
    save_file(filename, content)
    time.sleep(15)
if __name__ == '__main__':
    # OldDumpContentToFiles().run()
    old_block_num = 0
    block = [''] * 1000
    last_pk = Page.objects.order_by('-pk')[0].pk
    for page in Page.objects.iterate(prefetch=['page_content']):
        new_block_num = page.id / 1000
        i = page.id % 1000
        # Flush the accumulated block whenever the block number changes
        # (or when the very last page is reached).
        if new_block_num != old_block_num or page.id == last_pk:
            if page.id == last_pk:
                block[i] = page.page_content.content
            filename = "%sx" % old_block_num
            path = os.path.join(settings.CONTENT_FILES_PATH, filename)
            print path
            save_file(path, G.separator.join(block), encode='utf-8')
            block = [''] * 1000
            old_block_num = new_block_num
        block[i] = page.page_content.content