def save_step(self): if os.path.exists('stop'): print u'Stopping command because "stop" file exists.' self.stop() if not self.last_action_filename: raise NotImplementedError() # print '@', self.current_datetime, '-', "type: ", \ # self.last_action_filename if os.path.exists(self.state_filename): # print '# File exists' content = load_file(self.state_filename) last_action = datetime.strptime(content, self.date_format) # print last_action # print datetime.now() delta = datetime.now() - last_action # print delta if delta < timedelta(seconds=5): # print '<<<' # print return # save_file(self.last_action_filename, self.current_datetime) try: # print 'SAVED!!!', '@' * 100 save_file(self.state_filename, self.current_datetime) append_file(u'files/logs/%s-s' % self.last_action_filename, u'%s - %s' % (self.current_datetime, self.i)) except IOError: # pass # print print u'### IOError', '#' * 200
def parse_parsed_left(): path = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'left') files = os.listdir(path) for filename in files: filename = filename.decode('cp1251') # print filename content = load_file(join(path, filename), decode='utf-8') # print content items = re.findall('<li class="b-translation-review__item b-translation-review__current">(.*?)</li>', content, flags=re.DOTALL) if items: saved = False for item in items: m = re.match('<h3 class="b-translation-review__language">' '<a class="b-translation-review__link ajax ajax-full" ' 'href="[^"]+" onclick="[^"]+">([^<]+)</a>' '</h3>' '<span class="b-translation-review__translation">(.+?)</span>', item) if not m: print '#' * 100 print filename print item lang = m.group(1) value = m.group(2) if lang != u'с украинского': print 'x' * 100 print lang print value print if '[' in value: print 'v' * 100 if '\n' in value: print 'n' * 100 p = re.compile('<a class="b-translation-review__link ajax ajax-full" href="[^"]+" onclick="[^"]+">([^<]+)</a>') m = p.match(value) if m: check_word = m.group(1) if check_word == filename: print '$' * 100 value = p.sub(r'[\1]', value) # print filename # print check_word # print repr(filename) # print repr(check_word) # print value # print value = value.replace(u'¸', u'ё') if saved: print filename, 'ALREADY SAVED', value # save_file(join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'left.parsed', filename), # value, encode='utf-8') # print value saved = True
def update(self, page, value, log=False, edited=None, is_content=False, content_with_utf8mb4=None): if log and not is_content or not log and is_content: raise Exception('log and is_content should be both set or unset') if not self.field_name: return obj, created = self.get_or_create(page=page) changed = getattr(obj, self.field_name) != value if is_content: #folder = "%sx" % (page.id / 10000) #path = os.path.join(settings.FILES_PATH, 'pages', folder) #if not os.path.exists(path): # os.mkdir(path) #filename = os.path.join(path, "%s" % page.id) #old_content = '' #exist = os.path.exists(filename) #if exist: # old_content = load_file(filename, decode='utf-8') block_num = page.id / 1000 filename = "%sx" % block_num path = os.path.join(settings.CONTENT_FILES_PATH, filename) if os.path.exists(path): contents = load_file(path, decode='utf-8').split(G.separator) else: contents = [''] * 1000 old_content = contents[page.id % 1000] new_content = content_with_utf8mb4 page.cached_content = new_content if old_content != new_content: contents[page.id % 1000] = new_content save_file(path, G.separator.join(contents), encode='utf-8') print ' $ pk=%d: content updated (file)' % page.id print >> sys.stderr, '... Updated at %s: %s // pk=%d' % \ (edited, log, obj.pk) if changed: self.updating(obj, value) if log: print >> sys.stderr, '::: Updated at %s: %s // pk=%d' % \ (edited, log, obj.pk)
def parse_htmls(): path = YANDEX_SLOVARI_UK_RU_PATH files = os.listdir(path) for filename in files: filename = filename.decode('cp1251') print filename content = load_file(join(path, filename), decode='utf-8') # print content for block in ['left', 'center', 'right']: text = re.search('<td class="l-page__%s">(.*?)</td>' % block, content, flags=re.DOTALL).group(1) text = re.sub('<div id="b-direct-(bottom|right)"[^>]*></div>', '', text, flags=re.DOTALL) text = re.sub('<div class="b-translation__controls">.*?</div>', '', text, flags=re.DOTALL) text = re.sub('<i class="b-statcounter">.*</i>', '', text, flags=re.DOTALL) new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, block, filename) # if not exists(new_filename): save_file(new_filename, text.strip(), encode='utf-8')
def iterate(self, chunksize=1000, prefetch=None, my_last_pk=None, show_message=True, start_pk=0): pk = start_pk if my_last_pk: last_pk = my_last_pk else: last_pk = self.order_by('-pk')[0].pk if show_message: print dt(), last_pk queryset = self.order_by('pk') # i = 1 if settings.CONTENT_IN_FILES: pk = 0 while pk < last_pk: if settings.CONTENT_IN_FILES: items = queryset.filter(pk__gte=pk) else: items = queryset.filter(pk__gt=pk) if prefetch: items = items.prefetch_related(*prefetch) if settings.CONTENT_IN_FILES: block_num = pk / 1000 filename = "%sx" % block_num path = os.path.join(settings.CONTENT_FILES_PATH, filename) contents = load_file(path, decode='utf-8').split(G.separator) for row in items[:chunksize]: if row.pk >= pk + 1000: break row.cached_content = contents[row.pk % 1000] # if row.cached_content != row.page_content.content: # raise Exception('!!!') yield row else: for row in items[:chunksize]: pk = row.pk yield row if settings.CONTENT_IN_FILES: pk += 1000 gc.collect()
def content(self):
    """Return the page content from the block file or from the database.

    When ``settings.CONTENT_IN_FILES`` is falsy the value comes from
    ``self.page_content.content``.  Otherwise it is read from slot
    ``self.id % 1000`` of the file ``"<self.id / 1000>x"`` under
    ``settings.CONTENT_FILES_PATH`` and remembered on
    ``self.cached_content`` for later calls.

    Raises:
        Exception: if the block file for this page does not exist.
    """
    if not settings.CONTENT_IN_FILES:
        return self.page_content.content

    if not hasattr(self, 'cached_content'):
        block_path = os.path.join(settings.CONTENT_FILES_PATH,
                                  "%sx" % (self.id / 1000))
        if not os.path.exists(block_path):
            raise Exception('Content file for page %s not found.' % self.id)
        slots = load_file(block_path, decode='utf-8').split(G.separator)
        self.cached_content = slots[self.id % 1000]
    return self.cached_content
def check_for_start(self): if not self.last_action_filename: raise NotImplementedError() last_action = datetime.now() # default value for email delta = 'None' if os.path.exists(self.state_filename): content = load_file(self.state_filename) last_action = datetime.strptime(content, self.date_format) delta = datetime.now() - last_action if delta < timedelta(minutes=3) and not self.skip_mail: print "Don't start yet: " + self.last_action_filename send_wiki_mail( subject="Don't start yet: " + self.last_action_filename, message="%s // %s" % (last_action, delta) ) raise Exception("Don't start yet!!") print "Successfully started: " + self.last_action_filename send_wiki_mail( subject="Successfully started: " + self.last_action_filename, message="%s // %s" % (last_action, delta) ) append_file('files/timing', u'%s > %s' % (self.current_datetime_Kyiv, self.last_action_filename))
def parse_parsed_center(): path = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center') files = os.listdir(path) for filename in files: filename = filename.decode('cp1251') new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center.parsed', filename) # print filename content = load_file(join(path, filename), decode='utf-8') # print content p = re.compile('<div class="b-did-you-mean b-did-you-mean_margin_yes">(.+?)</div>', flags=re.DOTALL) m = p.search(content) if m: # print filename value = m.group(1) content = p.sub('', content) m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали: «<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»\?</span>', value) if not m: m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали одно из этих слов: («<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»(, )?)+\?</span>', value) if not m: print 'x' * 100 print value save_file(new_filename, '?', encode='utf-8') continue # wrong p = re.compile(u'<div class="b-misspell">В запросе «<strong>(.*?)</strong>» исправлена опечатка</div>') m = p.search(content) if m: misspell = m.group(1) misspell = re.sub('</?em>', '', misspell) if misspell != filename: print 'x' * 100 print misspell print filename content = p.sub('', content).strip() p = re.compile(u'<h6 class="b-translation__fulltext-link">Переведено с украинского на русский<img src="[^"]+"> по технологии «<a target="_blank" href="[^"]+">Яндекс.Перевод</a>».</h6><p class="b-translation__disclaimer">Пожалуйста, пользуйтесь результатом машинного перевода с осторожностью, возможны ошибки.</p>') m = p.search(content) if m: content = p.sub('', content) m = re.match('^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__fulltext"><p>([^>]+)</p></div>\s*</div>$', content) if not m: print filename print repr(filename) print content print # pass translate = m.group(1) save_file(new_filename, 'translated: %s' % translate, encode='utf-8') # 
print filename # print translate # print continue # ok p = re.compile(u'^<div class="b-translation b-translation_type_example i-bem" onclick="[^"]+">\s*<div class="b-translation__group"><div class="b-translation__group-line"><span class="b-translation__group-title">Найдено в примерах</span></div>\</div>(.*)\s*</div>$', flags=re.DOTALL) m = p.match(content) if m: content = m.group(1).strip() # print filename # m = re.match(u'^(<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">.*</div>)+$', content) # if not m: # print content items = re.findall(u'<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">(.*?)</div>', content) for title, example, source in items: title = remove_span(title) example = remove_span(example) source = remove_span(source) # print title # print example # print source # print if not re.match(u'^(Русско-украинский|Украинско-русский) словарь\. 
ИТФ «Перун» › .*', source): print 'x' * 100 print source continue # ok p = re.compile(u'^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__card b-translation__card_examples_three"><h1 class="b-translation__title"><span class="b-translation__text">(.*?)</span>(?: <img class="b-icon b-icon_type_audio-big" src="[^"]+" alt="Прослушать" title="Прослушать" onclick="[^"]+">)?</h1>(.*)</div>\s*</div>$') m = p.match(content) if m: title = m.group(1) content = m.group(2).strip() # if title != filename: # print filename # print title # print # m = re.match('(<div class="b-translation__group[^"]*">(.*?)</div>)+', content) # if not m: # print content # if re.search('<div class="b-translation__group">', content): # print filename # print content # print # pass # print content items = re.split('<div class="b-translation__group[^"]*">', content) for item in items: if not item: continue # print item # print p = re.compile(u'<div class="b-translation__grammar-note">(с|ж)</div>') m = p.search(item) if m: # print 'ok' item = p.sub('', item).strip() p = re.compile('<h2 class="b-translation__group-title" id="([^"]+)">(.*?)</h2>') m = p.search(item) if m: morpho_id = m.group(1) morpho_title = m.group(2) # print morpho_id, morpho_title item = p.sub('', item).strip() m = re.match('<ol class="b-translation__entries[^"]*">(.*)</ol>(.*)</div>', item) if not m: print 'x' * 100 print filename print item print lines = m.group(1).strip() tail = m.group(2).strip() if tail and not re.match('<div class="b-translation__rejex">(.*)</div>', tail): print 'x' * 100 print tail lines = re.findall('<li class="b-translation__entry">(.*?)</li>', lines) for line in lines: m = re.match('^<div class="b-translation__translation"><span class="b-translation__translation-words">(.*)</span></div>(?:<div class="b-translation__examples">(.*)</div>)?$', line) if not m: print 'x' * 100 print line continue line = m.group(1) examples = m.group(2) if examples: # print examples # print pass line 
= remove_span(line) p = re.compile('<a class="b-translation__link ajax ajax-full" href="[^"]+">([^<]+)</a>') line = p.sub(r'[\1]', line) line = line.replace(u'¸', u'ё') line = line.replace(u'<sup class="b-translation__src-num"></sup>', '') # print filename # print line # print continue # ok if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__message">К сожалению, мы ничего не нашли\.</div>Попробуйте поискать <a class="b-link" href="[^"]+">в энциклопедиях</a>\.</div></div>$', content): save_file(new_filename, '', encode='utf-8') continue # not found if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__failure">Похоже, что-то пошло не так. Пожалуйста, попробуйте обновить страницу.</div></div></div>$', content): # print filename old_filename = join(YANDEX_SLOVARI_UK_RU_PATH, filename) if exists(old_filename): os.rename(old_filename, join(YANDEX_SLOVARI_PATH, 'uk-ru-bad', filename)) print u"Файл перемещён: ", filename continue # bad print content print
# for label in G.labels_data: # # content = get_wiki_page_content(u'Шаблон:%s' % label) # content = get_wiki_page(u'Шаблон:%s' % label).get(get_redirect=True) # print label # save_file(path + label.replace(':', '_').replace('/', ' - ') + ".tpl", # content.encode('utf-8')) # sys.exit() files = os.listdir(path) results = {} for file in files: filename = path + file if os.path.isdir(filename): continue content = load_file(filename) file = file.decode('cp1251') # print file content = content.decode('utf-8') name = file[:-4] if name in G.labels_data_redirects_keys: continue vars = re.findall(u'\{\{\{([^|{}]+)', content) # for var in vars: # print '-', var results[name] = vars for key, value in G.labels_data_redirects.items(): results[key] = results[value]