Esempio n. 1
0
def find_good_pometa():
    """Collect template names from the label-template category tree.

    The root category (named by the literal below) and everything that
    ``get_subcategories`` returns for it are scanned.  For every article
    whose title matches the template-title pattern, the captured part (the
    text after the namespace prefix, up to and including the last dot on
    the first line of the title) is printed and recorded.

    Returns:
        list: the captured names in the order they were encountered.  A name
        is repeated if it is met more than once; no de-duplication is done.
    """
    root_name = u"Категория:Шаблоны_помет"
    sources = [get_category(root_name)]
    sources.extend(get_subcategories(root_name))

    # Compiled once up front; the greedy group runs to the last dot.
    title_pattern = re.compile(u'Шаблон:(.*\.)')

    collected = []
    for source in sources:
        for page in source.articles():
            hit = title_pattern.match(page.title())
            if hit is None:
                # Not a template title ending in a dotted name.
                continue
            name = hit.group(1)
            collected.append(name)
            # Single parenthesised argument: same output as the bare
            # print statement.
            print(name)
    return collected
Esempio n. 2
0
def analyze_category(title):
    queue = [title]
    while queue:
        title = queue.pop()
        print u"→", title
        if re.search(r'[\\"?*|<>]', title):
            print " ×", title, "—", "bad symbols in title"
            continue
        if title in processed_titles:
            print " ×", title, "—", "already used"
            continue
        lang_skipping = re.match(u".*(/[-a-z]{2,8}|по языкам)$", title)
        # if lang_skipping:
        #     print ' ×', title, '—', 'lang: skipping'
        #     continue
        processed_titles.append(title)
        file_title = title.replace("/", "#").replace(":", "%")
        if lang_skipping:
            dirname = lang_skipping.group(1).replace("/", "")
            skip_path = join(categories_path, "#", dirname)
            if not exists(skip_path):
                os.mkdir(skip_path)
            filename = join(skip_path, file_title)
        else:
            filename = join(categories_path, file_title)
        # complete_filename = join(categories_path, 'complete', file_title)
        # blocked_filename = join(categories_path, 'blocked', file_title)
        # if exists(complete_filename):
        #     # print u' ×', 'already exist'
        #     return
        # if exists(blocked_filename):
        #     print u' ×', title, '—', 'blocked'
        #     return
        if exists(filename):
            # print u' ←', 'exist, reading'
            base_titles, sub_titles = read_titles(filename)
        else:
            category = get_category("%s%s" % (category_prefix, title))
            base_titles = process_categories(category.categories())
            sub_titles = process_categories(category.subcategories())
            base_content = "\n".join(map(lambda x: "< %s" % x, base_titles)) or "-"
            sub_content = "\n".join(map(lambda x: "> %s" % x, sub_titles)) or "-"
            content = "%s\n\n%s\n" % (base_content, sub_content)
            save_file(filename, content, encode="utf-8")
            print u" +", title, "—", "saved"
        for sub_title in base_titles + sub_titles:
            if sub_title not in processed_titles:
                queue.append(sub_title)