Esempio n. 1
0
def get_info(input_url, source):
    """Scrape a series' table of contents and collect its metadata.

    Fetches the page at ``input_url``, reads the HTML selectors registered
    for ``source`` in the table returned by
    ``parse_info.original_source_info()``, and stores everything that was
    scraped under ``source_info[source]['info']``.

    Args:
        input_url: URL of the table-of-contents page to scrape.
        source: Key of the site entry inside the source table.

    Returns:
        The source table, extended with the top-level keys ``'input_url'``
        and ``'source'`` and with ``source_info[source]['info']`` holding
        the series name, author name, volume info, volume names, uid and a
        placeholder request id.
    """
    toc_html = query_info.get_page_html(input_url)
    uid = generate_info.random_uid()

    source_info = parse_info.original_source_info()
    html_selectors = source_info[source]['html']

    series_name = query_info.get_string(
        toc_html, html_selectors['series_name'])
    author_name = query_info.get_string(
        toc_html, html_selectors['author_name'])
    author_name = parse_info.format_author(source, author_name)

    html_index_id = html_selectors['html_index']
    vol_and_chap_id = html_selectors['vol_and_chap']
    chap_name_url_id = html_selectors['chap_name_url']

    # The index is looked up once and shared by both branches below.
    # get_index_children() is deliberately called afresh for every consumer,
    # exactly as the original code did -- presumably it can return a
    # one-shot iterator; confirm before caching its result.
    index = query_info.get_index(toc_html, html_index_id)
    index_children = query_info.get_index_children(index)

    if query_info.volumes_exist(index_children, vol_and_chap_id):
        volume_info = query_info.scrape_chapter_info_by_volume(
            query_info.get_index_children(index), vol_and_chap_id,
            chap_name_url_id)
    else:
        # No volume structure was detected: all chapters are scraped into
        # the container produced by get_dict_key(series_name).
        volume_info = query_info.scrape_all_chapter_info(
            query_info.get_dict_key(series_name),
            query_info.get_index_children(index),
            vol_and_chap_id, series_name, chap_name_url_id)

    volume_names = parse_info.extract_volume_names(
        query_info.get_index_children(index), vol_and_chap_id)

    source_info['input_url'] = input_url
    source_info['source'] = source
    source_info[source]['info'] = {
        'series_name': series_name,
        'author_name': author_name,
        'volume_info': volume_info,
        'volume_names': volume_names,
        'uid': uid,
        'request_id': 'PLACEHOLDER',
    }
    app.logger.debug('Scraper - Source info retrieved')
    # Lazy logger arguments: the table is only formatted when DEBUG is on.
    app.logger.debug('%s', source_info)
    return source_info
Esempio n. 2
0
def generate_single_volume_to_memory(memory_file, source_info, volume_number):
    """Build the EPUB of one volume and write it into ``memory_file``.

    Every chapter listed for the selected volume is downloaded, converted
    to an XHTML entry and added to a zip archive opened on ``memory_file``,
    together with the table of contents, content manifest, stylesheets,
    nav and template entries.

    Args:
        memory_file: Writable file-like object (or path) handed to
            ``zipfile.ZipFile`` as the archive target.
        source_info: Source table as returned by ``get_info``.
        volume_number: Key into the volume-names mapping that selects the
            volume to generate.

    Returns:
        ``memory_file``, after the archive has been closed.
    """
    input_url = parse_info.retrieve_input_url_from(source_info)
    source = parse_info.retrieve_source_from(source_info)
    series_name = parse_info.retrieve_series_name_from(source_info)
    author_name = parse_info.retrieve_author_name_from(source_info)
    volume_names = parse_info.retrieve_volume_names_from(source_info)
    volume_info = parse_info.retrieve_volume_info_from(source_info)
    uid = parse_info.retrieve_uid_from(source_info)

    volume_name = volume_names[volume_number]
    volume_formatted_name = parse_info.format_volume_name(
        volume_number, volume_name)
    volume_chapters = volume_info[volume_name]

    # Selector table of the current source; invariant across chapters.
    html_selectors = source_info[source]['html']

    # The context manager closes the archive even when fetching or writing
    # a chapter raises, so the zip handle is never leaked.
    with zipfile.ZipFile(memory_file, 'w') as epub:
        mimetype.create_mimetype(epub)
        container.create_container(epub)
        toc_string = toc.create_toc(uid, volume_formatted_name)
        # The content document is assembled as two strings (item ids and
        # item refs) that are concatenated once all chapters are known.
        content_string_first_half = content.create_content(
            volume_formatted_name, author_name, uid)
        content_string_second_half = content.create_middle()

        for j, (chapter_name, chapter_path) in enumerate(volume_chapters):
            chapter_url = chapters.format_chapter_url(
                html_selectors['url_regex'], chapter_path, input_url)
            chapter_html = query_info.get_page_html(chapter_url)
            chapter_number = chapters.chapter_get_text_for_element(
                chapter_html, html_selectors['chapter_number'])
            chapter_text = chapters.chapter_get_text_for_element(
                chapter_html, html_selectors['chapter_text'])
            chapter_complete = chapters.combine_name_and_text(
                chapter_number, chapter_text)

            xhtml_name = 'chap' + str(j)
            chapters.create_chapter(chapter_name, chapter_complete,
                                    xhtml_name, epub)

            toc_string = toc.add_nav(toc_string, xhtml_name, str(j))

            content_string_first_half = content.add_item_id(
                content_string_first_half, xhtml_name)
            content_string_second_half = content.add_itemref(
                content_string_second_half, xhtml_name)

        toc.finish_toc(toc_string, epub)
        content_string = content_string_first_half + content_string_second_half
        content.finish_content(content_string, epub)
        stylesheet.create_stylesheets(epub)
        nav.create_nav(epub)
        template.create_template(epub)

    app.logger.debug(
        'Scraper - One volume written to memory - Series: %s - Volume name: %s',
        series_name, volume_name)
    return memory_file
Esempio n. 3
0
def generate_all_volumes_to_disk(source_info):
    """Build one EPUB per volume and write them under ``./download``.

    A series folder is created through ``folders.create_series_folder``.
    For every entry of the volume-names mapping, an archive named
    ``<formatted volume name>.epub`` is then written to
    ``./download/<series_name>/``, and a progress line is printed once
    each volume is finished.

    Args:
        source_info: Source table as returned by ``get_info``.
    """
    input_url = parse_info.retrieve_input_url_from(source_info)
    source = parse_info.retrieve_source_from(source_info)
    series_name = parse_info.retrieve_series_name_from(source_info)
    author_name = parse_info.retrieve_author_name_from(source_info)
    volume_names = parse_info.retrieve_volume_names_from(source_info)
    volume_info = parse_info.retrieve_volume_info_from(source_info)
    uid = parse_info.retrieve_uid_from(source_info)

    folders.create_series_folder(series_name)

    # Selector table of the current source; invariant across volumes.
    html_selectors = source_info[source]['html']

    for volume_number, volume_name in volume_names.items():
        volume_formatted_name = parse_info.format_volume_name(
            volume_number, volume_name)
        volume_chapters = volume_info[volume_name]

        epub_file_name = volume_formatted_name + '.epub'
        epub_path = './download/{}/{}'.format(series_name, epub_file_name)

        # The context manager closes the file handle even when a chapter
        # download or write fails part-way through a volume.
        with zipfile.ZipFile(epub_path, 'w') as epub:
            mimetype.create_mimetype(epub)
            container.create_container(epub)
            toc_string = toc.create_toc(uid, volume_formatted_name)
            # The content document is assembled as two strings (item ids
            # and item refs) that are concatenated after the chapter loop.
            content_string_first_half = content.create_content(
                volume_formatted_name, author_name, uid)
            content_string_second_half = content.create_middle()

            for j, (chapter_name, chapter_path) in enumerate(
                    volume_chapters):
                chapter_url = chapters.format_chapter_url(
                    html_selectors['url_regex'], chapter_path, input_url)
                chapter_html = query_info.get_page_html(chapter_url)
                chapter_number = chapters.chapter_get_text_for_element(
                    chapter_html, html_selectors['chapter_number'])
                chapter_text = chapters.chapter_get_text_for_element(
                    chapter_html, html_selectors['chapter_text'])
                chapter_complete = chapters.combine_name_and_text(
                    chapter_number, chapter_text)

                xhtml_name = 'chap' + str(j)
                chapters.create_chapter(chapter_name, chapter_complete,
                                        xhtml_name, epub)

                toc_string = toc.add_nav(toc_string, xhtml_name, str(j))

                content_string_first_half = content.add_item_id(
                    content_string_first_half, xhtml_name)
                content_string_second_half = content.add_itemref(
                    content_string_second_half, xhtml_name)

            toc.finish_toc(toc_string, epub)
            content_string = (content_string_first_half
                              + content_string_second_half)
            content.finish_content(content_string, epub)
            stylesheet.create_stylesheets(epub)
            nav.create_nav(epub)
            template.create_template(epub)

        print('Completed: ' + volume_formatted_name)

    print('All files completed!')