Exemple #1
0
 def save_step(self):
     if os.path.exists('stop'):
         print u'Stopping command because "stop" file exists.'
         self.stop()
     if not self.last_action_filename:
         raise NotImplementedError()
     # print '@', self.current_datetime, '-', "type: ", \
     #     self.last_action_filename
     if os.path.exists(self.state_filename):
         # print '# File exists'
         content = load_file(self.state_filename)
         last_action = datetime.strptime(content, self.date_format)
         # print last_action
         # print datetime.now()
         delta = datetime.now() - last_action
         # print delta
         if delta < timedelta(seconds=5):
             # print '<<<'
             # print
             return
     # save_file(self.last_action_filename, self.current_datetime)
     try:
         # print 'SAVED!!!', '@' * 100
         save_file(self.state_filename, self.current_datetime)
         append_file(u'files/logs/%s-s' % self.last_action_filename,
                     u'%s - %s' % (self.current_datetime, self.i))
     except IOError:
         # pass
         # print
         print u'### IOError', '#' * 200
Exemple #2
0
def parse_parsed_left():
    path = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'left')
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        # print filename
        content = load_file(join(path, filename), decode='utf-8')
        # print content
        items = re.findall('<li class="b-translation-review__item b-translation-review__current">(.*?)</li>',
                           content, flags=re.DOTALL)
        if items:
            saved = False
            for item in items:
                m = re.match('<h3 class="b-translation-review__language">'
                             '<a class="b-translation-review__link ajax ajax-full" '
                             'href="[^"]+" onclick="[^"]+">([^<]+)</a>'
                             '</h3>'
                             '<span class="b-translation-review__translation">(.+?)</span>',
                             item)
                if not m:
                    print '#' * 100
                    print filename
                    print item
                lang = m.group(1)
                value = m.group(2)
                if lang != u'с украинского':
                    print 'x' * 100
                    print lang
                    print value
                    print
                if '[' in value:
                    print 'v' * 100
                if '\n' in value:
                    print 'n' * 100
                p = re.compile('<a class="b-translation-review__link ajax ajax-full" href="[^"]+" onclick="[^"]+">([^<]+)</a>')
                m = p.match(value)
                if m:
                    check_word = m.group(1)
                    if check_word == filename:
                        print '$' * 100
                    value = p.sub(r'[\1]', value)
                    # print filename
                    # print check_word
                    # print repr(filename)
                    # print repr(check_word)
                    # print value
                    # print
                value = value.replace(u'¸', u'ё')
                if saved:
                    print filename, 'ALREADY SAVED', value
                # save_file(join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'left.parsed', filename),
                #           value, encode='utf-8')
                # print value
                saved = True
Exemple #3
0
    def update(self, page, value, log=False, edited=None, is_content=False,
               content_with_utf8mb4=None):
        if log and not is_content or not log and is_content:
            raise Exception('log and is_content should be both set or unset')
        if not self.field_name:
            return
        obj, created = self.get_or_create(page=page)
        changed = getattr(obj, self.field_name) != value
        if is_content:
            #folder = "%sx" % (page.id / 10000)
            #path = os.path.join(settings.FILES_PATH, 'pages', folder)
            #if not os.path.exists(path):
            #    os.mkdir(path)
            #filename = os.path.join(path, "%s" % page.id)
            #old_content = ''
            #exist = os.path.exists(filename)
            #if exist:
            #    old_content = load_file(filename, decode='utf-8')

            block_num = page.id / 1000
            filename = "%sx" % block_num
            path = os.path.join(settings.CONTENT_FILES_PATH, filename)
            if os.path.exists(path):
                contents = load_file(path, decode='utf-8').split(G.separator)
            else:
                contents = [''] * 1000
            old_content = contents[page.id % 1000]

            new_content = content_with_utf8mb4
            page.cached_content = new_content
            if old_content != new_content:
                contents[page.id % 1000] = new_content
                save_file(path, G.separator.join(contents), encode='utf-8')
                print '           $ pk=%d: content updated (file)' % page.id
                print >> sys.stderr, '... Updated at %s: %s  // pk=%d' % \
                                     (edited, log, obj.pk)
        if changed:
            self.updating(obj, value)
            if log:
                print >> sys.stderr, '::: Updated at %s: %s  // pk=%d' % \
                                     (edited, log, obj.pk)
Exemple #4
0
def parse_htmls():
    path = YANDEX_SLOVARI_UK_RU_PATH
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        print filename
        content = load_file(join(path, filename), decode='utf-8')
        # print content
        for block in ['left', 'center', 'right']:
            text = re.search('<td class="l-page__%s">(.*?)</td>' % block,
                             content, flags=re.DOTALL).group(1)
            text = re.sub('<div id="b-direct-(bottom|right)"[^>]*></div>', '',
                          text, flags=re.DOTALL)
            text = re.sub('<div class="b-translation__controls">.*?</div>', '',
                          text, flags=re.DOTALL)
            text = re.sub('<i class="b-statcounter">.*</i>', '', text,
                          flags=re.DOTALL)
            new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, block,
                                filename)
            # if not exists(new_filename):
            save_file(new_filename, text.strip(), encode='utf-8')
Exemple #5
0
 def iterate(self, chunksize=1000, prefetch=None, my_last_pk=None,
             show_message=True, start_pk=0):
     pk = start_pk
     if my_last_pk:
         last_pk = my_last_pk
     else:
         last_pk = self.order_by('-pk')[0].pk
     if show_message:
         print dt(), last_pk
     queryset = self.order_by('pk')
     # i = 1
     if settings.CONTENT_IN_FILES:
         pk = 0
     while pk < last_pk:
         if settings.CONTENT_IN_FILES:
             items = queryset.filter(pk__gte=pk)
         else:
             items = queryset.filter(pk__gt=pk)
         if prefetch:
             items = items.prefetch_related(*prefetch)
         if settings.CONTENT_IN_FILES:
             block_num = pk / 1000
             filename = "%sx" % block_num
             path = os.path.join(settings.CONTENT_FILES_PATH, filename)
             contents = load_file(path, decode='utf-8').split(G.separator)
             for row in items[:chunksize]:
                 if row.pk >= pk + 1000:
                     break
                 row.cached_content = contents[row.pk % 1000]
                 # if row.cached_content != row.page_content.content:
                 #     raise Exception('!!!')
                 yield row
         else:
             for row in items[:chunksize]:
                 pk = row.pk
                 yield row
         if settings.CONTENT_IN_FILES:
             pk += 1000
         gc.collect()
Exemple #6
0
 def content(self):
     """Return the page text from the block file or from the database.

     When ``settings.CONTENT_IN_FILES`` is not set, the text comes from
     ``self.page_content.content``.  Otherwise it is read from slot
     ``self.id % 1000`` of the block file
     ``<CONTENT_FILES_PATH>/<self.id / 1000>x`` and remembered on
     ``self.cached_content``, so the file is read at most once per object.

     Raises:
         Exception: if the block file does not exist.
     """
     if not settings.CONTENT_IN_FILES:
         return self.page_content.content

     if not hasattr(self, 'cached_content'):
         # Python 2 integer division picks the block number.
         block_path = os.path.join(settings.CONTENT_FILES_PATH,
                                   "%sx" % (self.id / 1000))
         if not os.path.exists(block_path):
             raise Exception('Content file for page %s not found.' %
                             self.id)
         entries = load_file(block_path, decode='utf-8').split(G.separator)
         self.cached_content = entries[self.id % 1000]
     return self.cached_content
Exemple #7
0
 def check_for_start(self):
     if not self.last_action_filename:
         raise NotImplementedError()
     last_action = datetime.now()  # default value for email
     delta = 'None'
     if os.path.exists(self.state_filename):
         content = load_file(self.state_filename)
         last_action = datetime.strptime(content, self.date_format)
         delta = datetime.now() - last_action
         if delta < timedelta(minutes=3) and not self.skip_mail:
             print "Don't start yet: " + self.last_action_filename
             send_wiki_mail(
                 subject="Don't start yet: " + self.last_action_filename,
                 message="%s // %s" % (last_action, delta)
             )
             raise Exception("Don't start yet!!")
     print "Successfully started: " + self.last_action_filename
     send_wiki_mail(
         subject="Successfully started: " + self.last_action_filename,
         message="%s // %s" % (last_action, delta)
     )
     append_file('files/timing',
                 u'%s > %s' % (self.current_datetime_Kyiv,
                               self.last_action_filename))
Exemple #8
0
def parse_parsed_center():
    path = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center')
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center.parsed', filename)
        # print filename
        content = load_file(join(path, filename), decode='utf-8')
        # print content
        p = re.compile('<div class="b-did-you-mean b-did-you-mean_margin_yes">(.+?)</div>', flags=re.DOTALL)
        m = p.search(content)
        if m:
            # print filename
            value = m.group(1)
            content = p.sub('', content)
            m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали: «<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»\?</span>', value)
            if not m:
                m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали одно из этих слов: («<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»(, )?)+\?</span>', value)
                if not m:
                    print 'x' * 100
                    print value
            save_file(new_filename, '?', encode='utf-8')
            continue # wrong
        p = re.compile(u'<div class="b-misspell">В запросе «<strong>(.*?)</strong>» исправлена опечатка</div>')
        m = p.search(content)
        if m:
            misspell = m.group(1)
            misspell = re.sub('</?em>', '', misspell)
            if misspell != filename:
                print 'x' * 100
                print misspell
                print filename
            content = p.sub('', content).strip()
        p = re.compile(u'<h6 class="b-translation__fulltext-link">Переведено с украинского на русский<img src="[^"]+"> по технологии «<a target="_blank" href="[^"]+">Яндекс.Перевод</a>».</h6><p class="b-translation__disclaimer">Пожалуйста, пользуйтесь результатом машинного перевода с осторожностью, возможны ошибки.</p>')
        m = p.search(content)
        if m:
            content = p.sub('', content)
            m = re.match('^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__fulltext"><p>([^>]+)</p></div>\s*</div>$', content)
            if not m:
                print filename
                print repr(filename)
                print content
                print
                # pass
            translate = m.group(1)
            save_file(new_filename, 'translated: %s' % translate, encode='utf-8')
            # print filename
            # print translate
            # print
            continue  # ok
        p = re.compile(u'^<div class="b-translation b-translation_type_example i-bem" onclick="[^"]+">\s*<div class="b-translation__group"><div class="b-translation__group-line"><span class="b-translation__group-title">Найдено в примерах</span></div>\</div>(.*)\s*</div>$', flags=re.DOTALL)
        m = p.match(content)
        if m:
            content = m.group(1).strip()
            # print filename
            # m = re.match(u'^(<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">.*</div>)+$', content)
            # if not m:
            #     print content
            items = re.findall(u'<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">(.*?)</div>', content)
            for title, example, source in items:
                title = remove_span(title)
                example = remove_span(example)
                source = remove_span(source)
                # print title
                # print example
                # print source
                # print
                if not re.match(u'^(Русско-украинский|Украинско-русский) словарь\. ИТФ «Перун» › .*', source):
                    print 'x' * 100
                    print source
            continue  # ok
        p = re.compile(u'^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__card b-translation__card_examples_three"><h1 class="b-translation__title"><span class="b-translation__text">(.*?)</span>(?: <img class="b-icon b-icon_type_audio-big" src="[^"]+" alt="Прослушать" title="Прослушать" onclick="[^"]+">)?</h1>(.*)</div>\s*</div>$')
        m = p.match(content)
        if m:
            title = m.group(1)
            content = m.group(2).strip()
            # if title != filename:
            #     print filename
            #     print title
            #     print
            # m = re.match('(<div class="b-translation__group[^"]*">(.*?)</div>)+', content)
            # if not m:
            #     print content
            # if re.search('<div class="b-translation__group">', content):
                # print filename
                # print content
                # print
                # pass
            # print content
            items = re.split('<div class="b-translation__group[^"]*">', content)
            for item in items:
                if not item:
                    continue
                # print item
                # print
                p = re.compile(u'<div class="b-translation__grammar-note">(с|ж)</div>')
                m = p.search(item)
                if m:
                    # print 'ok'
                    item = p.sub('', item).strip()
                p = re.compile('<h2 class="b-translation__group-title" id="([^"]+)">(.*?)</h2>')
                m = p.search(item)
                if m:
                    morpho_id = m.group(1)
                    morpho_title = m.group(2)
                    # print morpho_id, morpho_title
                    item = p.sub('', item).strip()
                m = re.match('<ol class="b-translation__entries[^"]*">(.*)</ol>(.*)</div>', item)
                if not m:
                    print 'x' * 100
                    print filename
                    print item
                    print
                lines = m.group(1).strip()
                tail = m.group(2).strip()
                if tail and not re.match('<div class="b-translation__rejex">(.*)</div>', tail):
                    print 'x' * 100
                    print tail
                lines = re.findall('<li class="b-translation__entry">(.*?)</li>', lines)
                for line in lines:
                    m = re.match('^<div class="b-translation__translation"><span class="b-translation__translation-words">(.*)</span></div>(?:<div class="b-translation__examples">(.*)</div>)?$', line)
                    if not m:
                        print 'x' * 100
                        print line
                        continue
                    line = m.group(1)
                    examples = m.group(2)
                    if examples:
                        # print examples
                        # print
                        pass
                    line = remove_span(line)
                    p = re.compile('<a class="b-translation__link ajax ajax-full" href="[^"]+">([^<]+)</a>')
                    line = p.sub(r'[\1]', line)
                    line = line.replace(u'¸', u'ё')
                    line = line.replace(u'<sup class="b-translation__src-num"></sup>', '')
                    # print filename
                    # print line
                    # print
            continue  # ok
        if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__message">К сожалению, мы ничего не нашли\.</div>Попробуйте поискать <a class="b-link" href="[^"]+">в энциклопедиях</a>\.</div></div>$', content):
            save_file(new_filename, '', encode='utf-8')
            continue  # not found
        if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__failure">Похоже, что-то пошло не так. Пожалуйста, попробуйте обновить страницу.</div></div></div>$', content):
            # print filename
            old_filename = join(YANDEX_SLOVARI_UK_RU_PATH, filename)
            if exists(old_filename):
                os.rename(old_filename,
                          join(YANDEX_SLOVARI_PATH, 'uk-ru-bad', filename))
                print u"Файл перемещён: ", filename
            continue  # bad
        print content
        print
Exemple #9
0
# Disabled one-off step kept for reference: it fetched every label template
# and stored it as a ".tpl" file under `path`, then exited.
# for label in G.labels_data:
#     # content = get_wiki_page_content(u'Шаблон:%s' % label)
#     content = get_wiki_page(u'Шаблон:%s' % label).get(get_redirect=True)
#     print label
#     save_file(path + label.replace(':', '_').replace('/', ' - ') + ".tpl",
#               content.encode('utf-8'))
# sys.exit()

# `path` is defined outside this excerpt; it is concatenated directly with
# file names below, so it is expected to end with a path separator.
files = os.listdir(path)

# Maps a name (file name without its last four characters) to the list of
# strings found right after each "{{{" in that file.
results = {}
for file in files:
    filename = path + file
    # Only regular entries are processed; sub-directories are skipped.
    if os.path.isdir(filename):
        continue
    content = load_file(filename)
    # File names are decoded from cp1251, file contents from utf-8.
    file = file.decode('cp1251')
    # print file
    content = content.decode('utf-8')
    # Drops the last four characters -- presumably the ".tpl" extension
    # written by the disabled step above (TODO confirm).
    name = file[:-4]
    # Names listed as redirect keys are filled in by the loop further down.
    if name in G.labels_data_redirects_keys:
        continue
    # Captures the text following "{{{" up to the first "|", "{" or "}".
    vars = re.findall(u'\{\{\{([^|{}]+)', content)
    # for var in vars:
    #     print '-', var
    results[name] = vars

# Each redirect key reuses the list collected for its target; a target
# missing from `results` raises KeyError here.
for key, value in G.labels_data_redirects.items():
    results[key] = results[value]