Example #1
 def save_step(self):
     if os.path.exists('stop'):
         print u'Stopping command because "stop" file exists.'
         self.stop()
     if not self.last_action_filename:
         raise NotImplementedError()
     if os.path.exists(self.state_filename):
         # throttle: skip saving if the last recorded action was under 5 seconds ago
         content = load_file(self.state_filename)
         last_action = datetime.strptime(content, self.date_format)
         delta = datetime.now() - last_action
         if delta < timedelta(seconds=5):
             return
     try:
         save_file(self.state_filename, self.current_datetime)
         append_file(u'files/logs/%s-s' % self.last_action_filename,
                     u'%s - %s' % (self.current_datetime, self.i))
     except IOError:
         print u'### IOError', '#' * 200
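
Example #1 throttles writes by comparing the timestamp stored in the state file against the current time. Below is a minimal standalone sketch of that check, assuming plain file I/O in place of the project's load_file helper, with a hypothetical DATE_FORMAT standing in for self.date_format:

import os
from datetime import datetime, timedelta

DATE_FORMAT = '%Y-%m-%d %H:%M:%S'  # assumption: stands in for self.date_format

def should_save(state_filename, min_interval=timedelta(seconds=5)):
    # Return False while the state file records an action newer than min_interval.
    if os.path.exists(state_filename):
        with open(state_filename) as f:
            last_action = datetime.strptime(f.read().strip(), DATE_FORMAT)
        if datetime.now() - last_action < min_interval:
            return False
    return True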
Example #2
def save_db_to_file():
    i = 0
    print dt(), i
    output = list()
    # one "value|dictionary_id" line per Word_Value row
    # (earlier variants iterated Word_Description with prefetch_related instead)
    for item in Word_Value.objects.iterate(100000):
        i += 1
        output.append("%s|%s" % (item.value, item.dictionary_id))
        if not i % 10000:
            print dt(), i

    save_file('db_data.txt', '\n'.join(output), 'utf-8')
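
Word_Value.objects.iterate(100000) in Example #2 is a project-specific manager method that is not shown here. A common way to build such chunked iteration over a Django queryset is to page by primary key so memory stays bounded; a sketch under that assumption (the real iterate() may differ):

def iterate_by_pk(queryset, chunk_size=100000):
    # Yield rows in pk order, fetching chunk_size of them per query.
    last_pk = 0
    while True:
        chunk = list(queryset.filter(pk__gt=last_pk)
                             .order_by('pk')[:chunk_size])
        if not chunk:
            break
        for obj in chunk:
            yield obj
        last_pk = chunk[-1].pk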
Example #3
 def after(self):
     result = '<div class="index">\n'
     for key in sorted(self.groups.keys()):
         result += '\n== %s ==\n' % key
         for value in sorted(self.groups[key], key=lambda x: x[::-1]):  # sort by reversed word, i.e. by ending
             result += '* [[%s]]\n' % value
     result += '\n</div>\n'
     save_file(join(settings.DATA_PATH, 'сущ ru words.txt'), result)
Example #4
 def content_action(self, page, content, **kwargs):
     if self.i < 1050000:  # or self.i > 950000:
         return
     folder = "%sx" % (page.id / 10000)
     path = os.path.join(settings.FILES_PATH, 'pages', folder)
     if not os.path.exists(path):
         os.mkdir(path)
     filename = os.path.join(path, "%s" % page.id)
     save_file(filename, content, encode='utf-8')
     if not self.i % 100:
         time.sleep(2)
Example #5
 def content_action(self, page, content, **kwargs):
     # (earlier variants matched {{-ru-}} language headers or {{semantcat}}
     # in the page content instead of the title)
     if u'semantcat' in page.title:
         save_file(page.title.replace(':', '_').replace('/', ' - ') + ".tpl",
                   content.encode('utf-8'))
Example #6
 def after(self):
     WordInflection.objects.bulk_add()
     result = u'<div class="index">\n'
     for key in sorted(self.groups.keys()):
         values = sorted(self.groups[key], key=lambda x: x[::-1])
         result += u'\n== <nowiki>%s</nowiki> (%d) ==\n' % \
                   (key, len(values))
         for value in values:
             result += u'* [[%s]]\n' % value
     result += u'\n</div>\n'
     save_file(join(settings.DATA_PATH, u'сущ ru words.txt'), result,
               encode='utf-8')
Example #7
def analyze_words(words):
    result = list()

    for word, data in words.items():
        dicts_list = list()
        for dicts in data.values():
            for d in dicts:
                dicts_list.append(d.strip())
        dicts_text = ','.join(dicts_list)
        if len(data) == 0:
            print '[0]', word
        # if len(data) == 1:
        #     if word.upper() in data:
        #         print '[1U]', word.upper(), '-', dicts_text
        #     elif word.capitalize() in data:
        #         print '[1C]', word.capitalize(), '-', dicts_text
        if len(data) > 1:
            if word.capitalize() in data:
                if word.lower() in data or word.upper() in data:
                    if set(data[word.capitalize()]) == {'1'}:
                        print
                        print '[CC]', word.capitalize(), '-', dicts_text
                        for key, dicts in data.items():
                            print '   -', key, '-', ','.join(dicts)
                        print 'ok'
                        # [CC] Дэу - 1,1
                        #    - Дэу - 1
                        #    - дэу - 1

                        # Tricky case (both forms are present!!)
                        # [CC] Дуся - 1,2,4,3
                        #    - Дуся - 1
                        #    - дуся - 2,4,3

        # if only dict 6 has the lowercase form, look for the "[с прописной буквы]" (capitalized) note
        # if dict 1 has the uppercase form and dict 2 the lowercase one, it is lowercase!
        # if only in dicts 1 and 6: "Хэллоуин", "Мнемосина"
        # if only in dict 8, it may be unreliable?? "израилев"

        # if len(data) == 3:
        #     print '[3]', word, '-', dicts_text
        # if len(data) > 3:
        #     print '[*]', word, '-', dicts_text
        result.append("%s|%s" % (word, dicts_text))

    save_file('total_list_of_words.txt', '\n'.join(result))
Example #8
def create_file():
    words = []
    category = u'Русский_язык'
    path = join(categories_path, category)
    files = os.listdir(path)
    for filename in files:
        lines = open(join(path, filename)).read().split('\n')
        for line in lines:
            words.append(line)
    locale.setlocale(locale.LC_ALL, '')  # without the second argument setlocale only queries the current locale
    words = [word.decode('utf-8') for word in words]
    words = sorted(words, cmp=locale.strcoll)
    save_file(join(path, '..', 'wiktionary-russian-2013-08-02-text.txt'), "\n".join(words), 'utf-8')
    words2 = ["# [[%s]]" % word for word in words]
    save_file(join(path, '..', 'wiktionary-russian-2013-08-02-wiki.txt'), "\n".join(words2), 'utf-8')
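
The cmp=locale.strcoll form of sorted() used above exists only in Python 2; the key-based equivalent, already available on 2.7 via functools.cmp_to_key, is sketched below:

import locale
from functools import cmp_to_key  # available since Python 2.7

locale.setlocale(locale.LC_ALL, '')  # '' selects the user's default locale
words = [u'яблоко', u'арбуз', u'ёж']
print(sorted(words, key=cmp_to_key(locale.strcoll)))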
Example #9
def analyze_category(title):
    queue = [title]
    while queue:
        title = queue.pop()
        print u"→", title
        if re.search(r'[\\"?*|<>]', title):
            print " ×", title, "—", "bad symbols in title"
            continue
        if title in processed_titles:
            print " ×", title, "—", "already used"
            continue
        lang_skipping = re.match(u".*(/[-a-z]{2,8}|по языкам)$", title)
        # if lang_skipping:
        #     print ' ×', title, '—', 'lang: skipping'
        #     continue
        processed_titles.append(title)
        file_title = title.replace("/", "#").replace(":", "%")
        if lang_skipping:
            dirname = lang_skipping.group(1).replace("/", "")
            skip_path = join(categories_path, "#", dirname)
            if not exists(skip_path):
                os.mkdir(skip_path)
            filename = join(skip_path, file_title)
        else:
            filename = join(categories_path, file_title)
        # (earlier versions also tracked per-category 'complete' and 'blocked' marker files here)
        if exists(filename):
            # print u' ←', 'exist, reading'
            base_titles, sub_titles = read_titles(filename)
        else:
            category = get_category("%s%s" % (category_prefix, title))
            base_titles = process_categories(category.categories())
            sub_titles = process_categories(category.subcategories())
            base_content = "\n".join(map(lambda x: "< %s" % x, base_titles)) or "-"
            sub_content = "\n".join(map(lambda x: "> %s" % x, sub_titles)) or "-"
            content = "%s\n\n%s\n" % (base_content, sub_content)
            save_file(filename, content, encode="utf-8")
            print u" +", title, "—", "saved"
        for sub_title in base_titles + sub_titles:
            if sub_title not in processed_titles:
                queue.append(sub_title)
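
Stripped of the caching and the on-disk layout, analyze_category is a plain graph traversal: a work queue plus a record of visited nodes. A generic sketch of that core, where get_neighbours stands in for the categories()/subcategories() lookups:

def walk(start, get_neighbours):
    # Visit every node reachable from start exactly once.
    queue = [start]
    seen = set()  # a set gives O(1) membership tests; the example uses a list
    while queue:
        node = queue.pop()  # pop() walks depth-first; pop(0) would be breadth-first
        if node in seen:
            continue
        seen.add(node)
        for neighbour in get_neighbours(node):
            if neighbour not in seen:
                queue.append(neighbour)
    return seen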
Example #10
def download_pages():
    category = u'Русский_язык'
    print category
    print '=' * 10
    path = join(categories_path, category)
    if not exists(path):
        os.mkdir(path)
    p = 0
    url = get_category_page_url(category)
    while True:
        print dt(), 'Page', p
        content = download(url)
        next_url, content = extract_pages_and_next_link(content)
        filename = join(path, 'p%d' % p)
        save_file(filename, content, 'utf-8')
        p += 1
        if not next_url:
            break  # assumes extract_pages_and_next_link returns a falsy link on the last page
        url = 'http://ru.wiktionary.org%s' % next_url
        time.sleep(0.1)
Example #11
 def handle(self, *args, **options):
     if not self.name:
         raise NotImplementedError()
     append_file('files/timing', u'%s @ %s' % (self.current_datetime, self.name))
     print '@', self.current_datetime, '-', "Starting wikt command:", \
         self.name
     is_started = os.path.exists(self.state_filename)
     if is_started:
         print '# Command already started'
         if not self.skip_mail:
             send_wiki_mail(
                 subject="Wikt command already started: " + self.name,
                 message=open(self.state_filename).read()
             )
         return
     save_file(self.state_filename,
               "Started at " + self.current_datetime)
     self.wikt_handle(*args, **options)
     os.remove(self.state_filename)
     append_file('files/timing', u'%s . %s' % (self.current_datetime, self.name))
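
Example #11 uses its state file as a lock against concurrent runs: it refuses to start when the file exists and removes it only on success, so a crashed run leaves the lock behind for the next run to report. A minimal sketch of the pattern with placeholder names:

import os

def run_once(lock_filename, handler):
    # Run handler unless a lock file from a previous run is still present.
    if os.path.exists(lock_filename):
        print('already started (or a previous run crashed)')
        return
    with open(lock_filename, 'w') as f:
        f.write('started')
    handler()
    # removed only on success, so a crash leaves the lock in place
    # and the next invocation can alert the operator
    os.remove(lock_filename)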
Example #12
    def update(self, page, value, log=False, edited=None, is_content=False,
               content_with_utf8mb4=None):
        if bool(log) != bool(is_content):
            raise Exception('log and is_content must both be set or both be unset')
        if not self.field_name:
            return
        obj, created = self.get_or_create(page=page)
        changed = getattr(obj, self.field_name) != value
        if is_content:
            # (an earlier version stored each page in its own file under
            # pages/<id/10000>x/; content now lives in 1000-entry block files)
            block_num = page.id / 1000  # 1000 page contents per block file
            filename = "%sx" % block_num
            path = os.path.join(settings.CONTENT_FILES_PATH, filename)
            if os.path.exists(path):
                contents = load_file(path, decode='utf-8').split(G.separator)
            else:
                contents = [''] * 1000
            old_content = contents[page.id % 1000]

            new_content = content_with_utf8mb4
            page.cached_content = new_content
            if old_content != new_content:
                contents[page.id % 1000] = new_content
                save_file(path, G.separator.join(contents), encode='utf-8')
                print '           $ pk=%d: content updated (file)' % page.id
                print >> sys.stderr, '... Updated at %s: %s  // pk=%d' % \
                                     (edited, log, obj.pk)
        if changed:
            self.updating(obj, value)
            if log:
                print >> sys.stderr, '::: Updated at %s: %s  // pk=%d' % \
                                     (edited, log, obj.pk)
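
The content branch in Example #12 relies on a block layout: 1000 page contents per file, joined by a separator, updated with a read-modify-write of one slot. The same layout as a pair of standalone helpers, with SEPARATOR and content_dir as assumptions standing in for G.separator and settings.CONTENT_FILES_PATH:

import os

SEPARATOR = u'\n@@@@@\n'  # assumption: stands in for G.separator

def read_block(path):
    # Return the 1000 content slots of one block file.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return f.read().decode('utf-8').split(SEPARATOR)
    return [u''] * 1000

def write_page(content_dir, page_id, new_content):
    # Rewrite the block file only when the page's slot actually changed.
    path = os.path.join(content_dir, '%sx' % (page_id // 1000))
    contents = read_block(path)
    if contents[page_id % 1000] != new_content:
        contents[page_id % 1000] = new_content
        with open(path, 'wb') as f:
            f.write(SEPARATOR.join(contents).encode('utf-8'))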
Example #13
def parse_htmls():
    path = YANDEX_SLOVARI_UK_RU_PATH
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        print filename
        content = load_file(join(path, filename), decode='utf-8')
        for block in ['left', 'center', 'right']:
            text = re.search('<td class="l-page__%s">(.*?)</td>' % block,
                             content, flags=re.DOTALL).group(1)
            text = re.sub('<div id="b-direct-(bottom|right)"[^>]*></div>', '',
                          text, flags=re.DOTALL)
            text = re.sub('<div class="b-translation__controls">.*?</div>', '',
                          text, flags=re.DOTALL)
            text = re.sub('<i class="b-statcounter">.*</i>', '', text,
                          flags=re.DOTALL)
            new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, block,
                                filename)
            save_file(new_filename, text.strip(), encode='utf-8')
Example #14
def display_all_codes(letter):
    prefetch = ['en_wiki', 'equals', 'wikt_en', 'wikt_en__en_wiki',
                'wikt_en__en_wiki__redirect_to']
    items = LangCode.objects.filter(code__startswith=letter) \
        .prefetch_related(*prefetch).order_by('code')
    for lang in items:
        # note: each filter() below issues one query per language code
        lang.en_items = []
        for item in LangWikiEnIso1.objects.filter(code=lang):
            lang.en_items.append(item)
        for item in LangWikiEnIso3.objects.filter(code=lang):
            lang.en_items.append(item)
        lang.ru_items = []
        for item in LangWikiRuIso1.objects.filter(code=lang):
            lang.ru_items.append(item)
        for item in LangWikiRuIso3.objects.filter(code=lang):
            lang.ru_items.append(item)
    content = render_to_string("wiki/iso_langs.html", {'items': items,
                                                       'letter': letter})
    filename = os.path.join(settings.ROOT_PATH, 'wikt', 'data',
                            'langs_%s.txt' % letter)
    save_file(filename, content.replace('\r', ''), encode='utf-8')
    return content
Example #15
def download(slug, imin, imax, step=1, subdomain=False, missed=False):
    if subdomain:  # actually not needed
        url = 'http://%s.academic.ru/%%s/' % slug
    else:
        url = 'http://dic.academic.ru/dic.nsf/%s/%%s/' % slug
    if not missed:
        path_download = join(BASE_PATH, slug)
        path_out = None
        path_empty = None
    else:  # if ids have holes
        if subdomain:
            path_out = 'c:/download/%s.academic.ru.out' % slug
            path_download = 'c:/download/%s.academic.ru.download' % slug
            path_empty = ''
        else:
            path_out = join(BASE_PATH, '%s.words.out' % slug)
            path_download = join(BASE_PATH, '%s.words.download' % slug)
            path_empty = join(BASE_PATH, '%s.words.download.empty' % slug)
    if not exists(path_download):
        os.mkdir(path_download)
    for i in range(imin, imax, step):
        name = str(i)
        dt = datetime.now().strftime("[%H:%M:%S]")  # note: shadows the dt() helper used in other examples
        sslug = "(%s)" % slug
        filename = join(path_download, name)
        if exists(filename):
            print dt, name, sslug, '+++'
        elif path_out and exists(join(path_out, name)):
            print dt, name, sslug, '+'
        elif path_empty and exists(join(path_empty, name)):
            print dt, name, sslug, '---'
        else:
            print dt, name, sslug, 'downloading'
            content = urllib.urlopen(url % name).read().replace('\r', '')
            save_file(filename, content)
            time.sleep(0.1)
Example #16
 def after(self):
     super(SlogiReplacer, self).after()
     self.content += "|-\n|}"
     save_file(join(settings.DATA_PATH, 'slogi.txt'), self.content,
               encode='utf-8')
Example #17
def parse_parsed_center():
    path = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center')
    files = os.listdir(path)
    for filename in files:
        filename = filename.decode('cp1251')
        new_filename = join(YANDEX_SLOVARI_UK_RU_PARSE_PATH, 'center.parsed', filename)
        # print filename
        content = load_file(join(path, filename), decode='utf-8')
        # print content
        p = re.compile('<div class="b-did-you-mean b-did-you-mean_margin_yes">(.+?)</div>', flags=re.DOTALL)
        m = p.search(content)
        if m:
            # print filename
            value = m.group(1)
            content = p.sub('', content)
            m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали: «<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»\?</span>', value)
            if not m:
                m = re.match(u'<span class="b-did-you-mean__content">Быть может, вы искали одно из этих слов: («<a class="ajax ajax-full" href="/[^/]+/uk-ru"><strong>([^>]+)</strong></a>»(, )?)+\?</span>', value)
                if not m:
                    print 'x' * 100
                    print value
            save_file(new_filename, '?', encode='utf-8')
            continue  # wrong: a "did you mean" suggestion page
        p = re.compile(u'<div class="b-misspell">В запросе «<strong>(.*?)</strong>» исправлена опечатка</div>')
        m = p.search(content)
        if m:
            misspell = m.group(1)
            misspell = re.sub('</?em>', '', misspell)
            if misspell != filename:
                print 'x' * 100
                print misspell
                print filename
            content = p.sub('', content).strip()
        p = re.compile(u'<h6 class="b-translation__fulltext-link">Переведено с украинского на русский<img src="[^"]+"> по технологии «<a target="_blank" href="[^"]+">Яндекс.Перевод</a>».</h6><p class="b-translation__disclaimer">Пожалуйста, пользуйтесь результатом машинного перевода с осторожностью, возможны ошибки.</p>')
        m = p.search(content)
        if m:
            content = p.sub('', content)
            m = re.match('^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__fulltext"><p>([^>]+)</p></div>\s*</div>$', content)
            if not m:
                print filename
                print repr(filename)
                print content
                print
                continue  # unexpected markup: skip instead of crashing on m.group(1)
            translate = m.group(1)
            save_file(new_filename, 'translated: %s' % translate, encode='utf-8')
            continue  # ok
        p = re.compile(u'^<div class="b-translation b-translation_type_example i-bem" onclick="[^"]+">\s*<div class="b-translation__group"><div class="b-translation__group-line"><span class="b-translation__group-title">Найдено в примерах</span></div>\</div>(.*)\s*</div>$', flags=re.DOTALL)
        m = p.match(content)
        if m:
            content = m.group(1).strip()
            # print filename
            # m = re.match(u'^(<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">.*</div>)+$', content)
            # if not m:
            #     print content
            items = re.findall(u'<h1 class="b-translation__example-title">(.*?)</h1><div class="b-translation__example">(.*?)</div><div class="b-translation__example-source">(.*?)</div>', content)
            for title, example, source in items:
                title = remove_span(title)
                example = remove_span(example)
                source = remove_span(source)
                # print title
                # print example
                # print source
                # print
                if not re.match(u'^(Русско-украинский|Украинско-русский) словарь\. ИТФ «Перун» › .*', source):
                    print 'x' * 100
                    print source
            continue  # ok
        p = re.compile(u'^<div class="b-translation i-bem" onclick="[^"]+">\s*<div class="b-translation__card b-translation__card_examples_three"><h1 class="b-translation__title"><span class="b-translation__text">(.*?)</span>(?: <img class="b-icon b-icon_type_audio-big" src="[^"]+" alt="Прослушать" title="Прослушать" onclick="[^"]+">)?</h1>(.*)</div>\s*</div>$')
        m = p.match(content)
        if m:
            title = m.group(1)
            content = m.group(2).strip()
            # (title usually matches the filename; the content splits into
            # <div class="b-translation__group..."> blocks below)
            items = re.split('<div class="b-translation__group[^"]*">', content)
            for item in items:
                if not item:
                    continue
                # print item
                # print
                p = re.compile(u'<div class="b-translation__grammar-note">(с|ж)</div>')
                m = p.search(item)
                if m:
                    # print 'ok'
                    item = p.sub('', item).strip()
                p = re.compile('<h2 class="b-translation__group-title" id="([^"]+)">(.*?)</h2>')
                m = p.search(item)
                if m:
                    morpho_id = m.group(1)
                    morpho_title = m.group(2)
                    # print morpho_id, morpho_title
                    item = p.sub('', item).strip()
                m = re.match('<ol class="b-translation__entries[^"]*">(.*)</ol>(.*)</div>', item)
                if not m:
                    print 'x' * 100
                    print filename
                    print item
                    print
                    continue  # unexpected markup: skip this group
                lines = m.group(1).strip()
                tail = m.group(2).strip()
                if tail and not re.match('<div class="b-translation__rejex">(.*)</div>', tail):
                    print 'x' * 100
                    print tail
                lines = re.findall('<li class="b-translation__entry">(.*?)</li>', lines)
                for line in lines:
                    m = re.match('^<div class="b-translation__translation"><span class="b-translation__translation-words">(.*)</span></div>(?:<div class="b-translation__examples">(.*)</div>)?$', line)
                    if not m:
                        print 'x' * 100
                        print line
                        continue
                    line = m.group(1)
                    examples = m.group(2)  # examples block, currently unused
                    line = remove_span(line)
                    p = re.compile('<a class="b-translation__link ajax ajax-full" href="[^"]+">([^<]+)</a>')
                    line = p.sub(r'[\1]', line)
                    line = line.replace(u'¸', u'ё')
                    line = line.replace(u'<sup class="b-translation__src-num"></sup>', '')
            continue  # ok
        if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__message">К сожалению, мы ничего не нашли\.</div>Попробуйте поискать <a class="b-link" href="[^"]+">в энциклопедиях</a>\.</div></div>$', content):
            save_file(new_filename, '', encode='utf-8')
            continue  # not found
        if re.match(u'^<div class="b-translation i-bem" onclick="[^"]+"><div class="b-nothing-found"><div class="b-nothing-found__failure">Похоже, что-то пошло не так. Пожалуйста, попробуйте обновить страницу.</div></div></div>$', content):
            # print filename
            old_filename = join(YANDEX_SLOVARI_UK_RU_PATH, filename)
            if exists(old_filename):
                os.rename(old_filename,
                          join(YANDEX_SLOVARI_PATH, 'uk-ru-bad', filename))
                print u"Файл перемещён: ", filename
            continue  # bad
        print content
        print
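
remove_span, used in Example #17, is a project helper whose source is not shown; judging by how it is applied to translation fragments, it is presumably something like the sketch below:

import re

def remove_span(text):
    # assumption: drop <span ...> wrappers, keep the inner text
    return re.sub(r'</?span[^>]*>', '', text).strip()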
Example #18
def create_file():
    words = []
    files = os.listdir(path)  # 'path' comes from module scope
    for filename in files:
        lines = open(join(path, filename)).read().split("\n")
        # print filename.decode('cp1251'), "\t", len(lines)
        # continue
        for line in lines:
            line = line.strip().decode("utf-8")
            if not line or line in [
                u"{{Русский индекс}}",
                u"{{Русский индекс}}",
                u"__TOC__",
                u"[[ъ]]",
                u"[[ь]]",
                u":''В русском языке нет слов, начинающихся c твёрдого знака.''",
                u":''В русском языке нет слов, начинающихся с мягкого знака.''",
                u"[[uk:НА]]",
                u"[[uk:НЕ]]",
                u"[[uk:ПО]]",
            ]:
                continue
            skip_patterns = [
                u"^см. \[\[(/[а-яё])\|([а-яё]{2})\]\]$",
                u"^== ?([-а-яё/]+) ?==$",
                u"^\[\[[a-z]{2}:index [а-яё]+\]\]$",
                u"^\[\[en:Index:Russian/[а-яё]\]\]$",
            ]
            skip = False
            for pattern in skip_patterns:
                if match(pattern, line):
                    skip = True
                    break
            if skip:
                continue
            m = re.match(u"^\* ?\[\[([^]]+)\]\]!?( ?\{\{И1\}\})?$", line)
            if m:
                word = m.group(1)
                lopatin = m.group(2)
                # print line
                # print word, lopatin
                words.append(word)
            else:
                m = match(u"^\* ?\[\[([^]]+)\]\]!?([^{}]*)( ?\{\{И1\}\})?$", line)
                if m:
                    word = m.group(1)
                    # print repr(word)
                    # print repr(word.encode('utf-8'))
                    lopatin = m.group(3)
                    words.append(word)
                    # print line
                    # print word, lopatin
                else:
                    print "#" * 100
                    print filename.decode("cp1251")
                    print line
    locale.setlocale(locale.LC_ALL, "")  # todo: try ru.UTF-8
    words = sorted(words, cmp=locale.strcoll)
    save_file(join(path, "..", "wiktionary-index-2013-08-03-text.txt"), "\n".join(words), "utf-8")
    # words2 = sorted(["# [[%s]]" % word for word in words], key=lambda s: s.lower())
    words2 = ["# [[%s]]" % word for word in words]
    save_file(join(path, "..", "wiktionary-index-2013-08-03-wiki.txt"), "\n".join(words2), "utf-8")
    # words3 = sorted(["# [[:%s]]" % word for word in words], key=lambda s: s.lower())
    words3 = ["# [[:%s]]" % word for word in words]
    save_file(join(path, "..", "wiktionary-index-2013-08-03-wiki2.txt"), "\n".join(words3), "utf-8")
Example #19
 def after(self):
     print dt(), 'finished!', datetime.now() - self.s
     if settings.USE_FINISHED_FILENAME:
         save_file(self.finished_filename, unicode(datetime.now()))
Example #20
def get_and_save_wiki(title, filename):
    url = get_edit_page_url(title)
    content = download(url)
    wiki = extract_wiki_text(content)
    save_file(filename, wiki, 'utf8')
    return wiki
Example #21
    def process_item(self, item, i):
        # todo: create an external mechanism for pausing work (relevant for big processors)
        try:
            title = item.title()
        except InvalidTitle:
            print 'Wrong title', '#' * 120
            return
        if ':' in title:  # todo: we need this only for RecentProcessor
            if title.startswith(u"Шаблон:"):
                pass  # templates are processed like regular pages
            else:
                return
        if self.readonly:
            return self.process_item_readonly(item, title, i)
        # if Page.objects.filter(title=title):
        #     print dt(), title, '- exists'
        #     return
        page = self.get_page(title, i)
        if not page:
            return
        content, content_to_db, edited, redirect = self.get_data(item)
        if not edited:
            print dt(), '& PAGE WAS DELETED          - %d - & %s  // pk=%d' \
                        % (i, transliterate(title), page.pk)
            page.delete_and_log()
            return
        if not self.output_interval or not i % self.output_interval:
            marker = ':' if ':' in title else '#'
            print dt(), marker, edited, '-', i, '-', marker, \
                transliterate(title), ' // pk=%s' % page.pk
        log = transliterate(title)

        oldest = next(item.revisions(reverseOrder=True, total=1, content=True))
        created_at = aware(convert_wiki_date(oldest.timestamp))
        created_author = oldest.user
        created_lang = '?'
        if oldest.text is None:
            created_lang = '??'
        else:
            # print oldest.text
            # print repr(oldest.text)
            # print
            m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}',
                          oldest.text, flags=re.MULTILINE | re.UNICODE)
            m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text)
            m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text, flags=re.UNICODE | re.IGNORECASE)
            if m:
                created_lang = m.group(1)
            elif re.search(u'^= *Праславянский', oldest.text, flags=re.MULTILINE | re.UNICODE):
                created_lang = u'Праславянский'
            elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = u'eo'
            elif m2:
                created_lang = m2.group(1)
            elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'de'
            elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'fr'
            elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'it'
            elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'nl'
            elif re.search(u'\{\{(английский|en)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'nl'
            elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'de'
            elif re.search(u'\{\{(it)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'it'
            elif m_new:
                created_lang = m_new.group(1)
            elif re.search(u'#(redirect|перенаправление)', oldest.text, flags=re.MULTILINE | re.UNICODE | re.IGNORECASE):
                created_lang = u'-'
            else:
                save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk, oldest.text.encode('utf-8'))

        self.update_data(page, content, content_to_db, edited, redirect, log,
                         created_at, created_author, created_lang)
        if ':' in title:
            return
        return edited
Example #22
 def after(self):
     super(A, self).after()
     save_file(settings.FILES_PATH + '/data/not_lopatin_transcriptions.txt',
               self.result, encode=u'utf-8')
Example #23
# coding: utf-8
from dictionaries.utils.file import save_file
from wikt.commons.utils.wikibot import get_wiki_page_content

s = u'|egy=[[𓊪𓏏𓇯]]'  # Egyptian hieroglyphs lie outside the BMP: 4 bytes each in UTF-8

with open('test_utf.txt', mode='w') as f:
    f.write(s.encode('utf-8'))


content = get_wiki_page_content(u'небо')
save_file(u'небо-t.txt', content, encode=u'utf-8')
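
The hieroglyphs in this example need 4 bytes each in UTF-8, which MySQL's legacy utf8 charset cannot store; that is what the content_with_utf8mb4 argument in Example #12 is about. A small detection sketch (it assumes Python 3 or a wide Python 2 build, since narrow builds split astral characters into surrogate pairs):

def needs_utf8mb4(s):
    # True if any character lies outside the BMP, i.e. takes 4 bytes in UTF-8.
    return any(ord(ch) > 0xFFFF for ch in s)

print(needs_utf8mb4(u'небо'))        # False: Cyrillic is in the BMP
print(needs_utf8mb4(u'\U00013000'))  # True: an Egyptian hieroglyph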
Example #24
import os
import time
import urllib
from os.path import join

from wikt.models import Page
from wikt.tasks.parse.yandex_slovari.common import YANDEX_SLOVARI_UK_RU_PATH
# dt() and save_file() are project helpers (their imports are not shown in the original)


class AppURLopener(urllib.FancyURLopener):
    version = "Opera/9.80 (Windows NT 6.2; WOW64) Presto/2.12.388 Version/12.16"

urllib._urlopener = AppURLopener()


words = Page.objects.filter(page_content__content__contains="= {{-uk-").\
    values_list('title', flat=True)
print len(words)

for word in words:
    if word[0] == word[0].upper():
        continue  # skip capitalized words (proper nouns)
    # print dt(), word
    filename = join(YANDEX_SLOVARI_UK_RU_PATH, word)
    if os.path.exists(filename):
        # print u'→ exist'
        continue
    print dt(), word
    word_url = urllib.quote_plus(word.encode('utf-8'))
    # print urllib.quote_plus("привет")
    url = u"http://slovari.yandex.ru/%s/uk-ru" % word_url
    content = urllib.urlopen(url).read()
    # print content
    save_file(filename, content)
    time.sleep(15)
Example #25


if __name__ == '__main__':
    # OldDumpContentToFiles().run()

    old_block_num = 0
    block = [''] * 1000
    last_pk = Page.objects.order_by('-pk')[0].pk
    for page in Page.objects.iterate(prefetch=['page_content']):
        new_block_num = page.id / 1000
        i = page.id % 1000
        if new_block_num != old_block_num:
            # a new 1000-page block has started: flush the finished one
            filename = "%sx" % old_block_num
            path = os.path.join(settings.CONTENT_FILES_PATH, filename)
            print path
            save_file(path, G.separator.join(block), encode='utf-8')
            block = [''] * 1000
            old_block_num = new_block_num
        block[i] = page.page_content.content
        if page.id == last_pk:
            # flush the final (possibly partial) block as well
            filename = "%sx" % new_block_num
            path = os.path.join(settings.CONTENT_FILES_PATH, filename)
            print path
            save_file(path, G.separator.join(block), encode='utf-8')