Example #1
0
def parse_fb2(file_name: Path, get_transitions_func: Callable = None, coin_flip: List[str] = None):
    """
    Parse an FB2 book, print its metadata and save it under DIR_DUMP_BOOKS.

    Every ``body > section`` element becomes one section of the book. The
    section id is obtained by passing the section's ``title > p`` element to
    ``clear_number``; when that yields a falsy value the id "0" is used.

    :param file_name: path to the FB2 file to parse.
    :param get_transitions_func: optional callable used instead of
        ``get_transitions``; it is called with the list of the section's
        direct child tags.
    :param coin_flip: ids of the sections that must be stored with
        ``coin_flip=True``. ``None`` or an empty list marks no section.
    """
    print(file_name)

    root = parse(file_name.read_bytes())
    book = parse_book_info(root)

    # print('book:', book)
    print('title:', book.title)
    print('author:', book.author)
    print('annotation:', repr(book.annotation))
    print('coverpage_id:', book.coverpage_id)
    print('sequence_name:', book.sequence_name)
    print('sequence_num:', book.sequence_num)
    print('publisher:', book.publisher)
    print('images:', list(book.images))

    dir_book = DIR_DUMP_BOOKS / file_name.name

    # Resolve the optional arguments once instead of on every iteration
    transitions_func = get_transitions_func or get_transitions
    coin_flip_ids = coin_flip or []

    for section in root.select('body > section'):
        # The id has to be read before the title tags are removed below
        section_id = clear_number(section.select_one('title > p'))
        if not section_id:
            section_id = "0"

        # Remove the title tags that carry the page numbers
        for title in section.select('title'):
            title.decompose()

        tags = section.find_all(recursive=False)

        # Transitions and images are collected before preprocess_tags()
        # gets to touch the tags
        transitions = transitions_func(tags)
        images = get_images(tags)

        preprocess_tags(tags)
        section_tag = parse(''.join(map(str, tags)))
        html_section = get_section_text(section_tag, None if section_id == '0' else section_id)

        book.add_section(
            id=section_id,
            text=html_section,
            transitions=transitions,
            images=images,
            # coin_flip holds section ids (List[str]), so the id - not the
            # section tag itself - is what has to be looked up in it
            coin_flip=section_id in coin_flip_ids,
        )

    print('sections:', len(book.sections))

    book.save(dir_book)
Example #2
0
from tools.parsers.book import parse_book_info
from tools.parsers.utils import (
    parse, get_section_text, preprocess_tags, get_transitions, clear_number, get_images, DIR
)


# Pages on which a coin has to be flipped
COIN_FLIP = [
    '52'
]


# FB2 book to process, located under DIR (imported from tools.parsers.utils)
file_name = DIR / 'Ужастики-2' / 'Stayn_Beregis-Lilovoy-Pasty-_RuLit_Me.fb2'
print(file_name)

# Parse the raw file bytes, then extract the book metadata from the parsed tree
root = parse(file_name.read_bytes())
book = parse_book_info(root)

# Print the extracted metadata; annotation goes through repr() and
# images is materialised into a list before printing
# print('book:', book)
print('title:', book.title)
print('author:', book.author)
print('annotation:', repr(book.annotation))
print('coverpage_id:', book.coverpage_id)
print('sequence_name:', book.sequence_name)
print('sequence_num:', book.sequence_num)
print('publisher:', book.publisher)
print('images:', list(book.images))

# Path named after the source file inside DIR_DUMP_BOOKS
# NOTE(review): presumably the directory the book is saved to - the code using it is not visible here
dir_book = DIR_DUMP_BOOKS / file_name.name

# NOTE(review): presumably maps a section to its list of tags and is filled further down - confirm
section_by_tags = defaultdict(list)
Example #3
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

from pathlib import Path
from tools.parsers.utils import parse

# Print the series number (or "None") followed by the path of every FB2 file
# found in the folder.
for file_name in Path('Ужастики-2').glob('*.fb2'):
    root = parse(file_name.read_bytes())

    title_info_tag = root.select_one('description > title-info')

    # A book without <title-info> is treated like a book without <sequence>:
    # no series information instead of an AttributeError on None
    sequence_tag = title_info_tag.select_one('sequence') if title_info_tag is not None else None
    sequence_name = sequence_tag.get('name') if sequence_tag is not None else None

    sequence_num = None
    if sequence_tag is not None:
        try:
            sequence_num = int(sequence_tag.get('number'))
        except (TypeError, ValueError):
            # TypeError: the "number" attribute is absent (get() returned None);
            # ValueError: the attribute is present but is not an integer
            sequence_num = None

    print('{:4}'.format(str(sequence_num)), file_name)
def parse_fb2(file_name: Path, coin_flip: List[str] = None):
    """
    Parse an FB2 book, print its metadata and save it under DIR_DUMP_BOOKS.

    The book sections are cut out of the first ``body > section`` element:
    a child tag for which ``is_start_section`` is true opens a new section
    whose id is ``get_plaintext(tag)``, and the tags that follow belong to
    that section until the next start tag. Tags located before the first
    start tag are ignored. When two start tags produce the same id, the
    later section replaces the earlier one.

    :param file_name: path to the FB2 file to parse.
    :param coin_flip: ids of the sections that must be stored with
        ``coin_flip=True``. ``None`` or an empty list marks no section.
    """
    print(file_name)

    root = parse(file_name.read_bytes())
    book = parse_book_info(root)

    # print('book:', book)
    print('title:', book.title)
    print('author:', book.author)
    print('annotation:', repr(book.annotation))
    print('coverpage_id:', book.coverpage_id)
    print('sequence_name:', book.sequence_name)
    print('sequence_num:', book.sequence_num)
    print('publisher:', book.publisher)
    print('images:', list(book.images))

    dir_book = DIR_DUMP_BOOKS / file_name.name

    # Treat a missing list the same way as an empty one
    coin_flip = coin_flip or []

    # Section id -> tags of that section, in document order
    section_by_tags = {}
    tags = None

    for tag in root.select_one('body > section').children:
        # Skip children without a tag name (NavigableString)
        if not tag.name:
            continue

        if is_start_section(tag):
            # The start tag only names the section, it is not part of its body
            tags = []
            section_by_tags[get_plaintext(tag)] = tags
            continue

        # tags stays None until the first start tag has been seen
        if tags is not None:
            tags.append(tag)

    for section, tags in section_by_tags.items():
        # Transitions and images are collected before preprocess_tags()
        # gets to touch the tags
        transitions = get_transitions(tags)
        images = get_images(tags)

        preprocess_tags(tags)
        section_tag = parse(''.join(map(str, tags)))
        html_section = get_section_text(section_tag, section)

        book.add_section(id=section,
                         text=html_section,
                         transitions=transitions,
                         images=images,
                         coin_flip=section in coin_flip)

    print('sections:', len(book.sections))

    book.save(dir_book)