Example #1
0
    def test_extensions_order_ascii_only(self):
        # '12345.txt' is the sole entry passed as valid_files, so the URI
        # built for text 12345 must end in that filename.
        only_file = '12345.txt'
        self.request_head_response(valid_files=[only_file])

        # Same expectation with default arguments and with prefer_ascii=True.
        for options in ({}, {'prefer_ascii': True}):
            uri = text._format_download_uri(12345, **options)
            self.assertEqual(uri.rpartition('/')[2], only_file)
Example #2
0
    def test_extensions_order_ascii_only(self):
        # Only the bare '.txt' name is passed as valid_files; the last path
        # segment of the generated URI has to match it in both call forms.
        expected_name = '12345.txt'
        self.request_head_response(valid_files=[expected_name])

        default_uri = text._format_download_uri(12345)
        default_name = default_uri.rpartition('/')[2]
        self.assertEqual(default_name, expected_name)

        ascii_uri = text._format_download_uri(12345, prefer_ascii=True)
        ascii_name = ascii_uri.rpartition('/')[2]
        self.assertEqual(ascii_name, expected_name)
Example #3
0
    def test_extensions_order_ascii_first(self):
        # Three candidate names are passed as valid_files: the '-8' and '-0'
        # variants plus the bare '.txt' name.
        plain_name = '12345.txt'
        candidates = ['12345-8.txt', '12345-0.txt', plain_name]
        self.request_head_response(valid_files=candidates)

        # Default call: the chosen file must NOT be the bare '.txt' name.
        default_uri = text._format_download_uri(12345)
        default_name = default_uri.rpartition('/')[2]
        self.assertNotEqual(default_name, plain_name)

        # With prefer_ascii=True the bare '.txt' name must be chosen.
        ascii_uri = text._format_download_uri(12345, prefer_ascii=True)
        ascii_name = ascii_uri.rpartition('/')[2]
        self.assertEqual(ascii_name, plain_name)
Example #4
0
    def test_extensions_order_ascii_first(self):
        # valid_files lists the '-8', '-0' and bare '.txt' names; which one
        # ends the URI depends on the prefer_ascii flag.
        bare_txt = '12345.txt'
        offered = ['12345-8.txt', '12345-0.txt', bare_txt]
        self.request_head_response(valid_files=offered)

        chosen_by_default = text._format_download_uri(12345).rpartition('/')[2]
        self.assertNotEqual(chosen_by_default, bare_txt)

        chosen_with_ascii = text._format_download_uri(
            12345, prefer_ascii=True).rpartition('/')[2]
        self.assertEqual(chosen_with_ascii, bare_txt)
Example #5
0
# Running totals shown in the progress message below.
total_spaces = 0
accepted = 0

# Walk the metadata records, fetching each book's text and download URI.
for i, book in enumerate(meta):

    # Report progress every tenth book. The parentheses are required: `%`
    # binds tighter than `+`, so `i + 1 % 10` evaluates as `i + 1`, which is
    # never zero here, and the message would never be written.
    if (i + 1) % 10 == 0:
        sys.stderr.write("\rSeen " + str(i + 1) + " books, accepted " +
                         str(accepted) + " with " + str(total_spaces) +
                         " spaces              ")

    # Each record is three tab-separated fields: id, title, author.
    book_id, title, author = book.split("\t")
    try:
        e_text = load_etext(int(book_id),
                            mirror="http://gutenberg.readingroo.ms")
        url = _format_download_uri(int(book_id),
                                   mirror="http://gutenberg.readingroo.ms")
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the script.
    except Exception:  # can't find URI, e.g. UnknownDownloadUriException
        sys.stderr.write("WARN: could not download text ID:" + str(book_id) +
                         "\n")
        continue

    text = strip_headers(e_text).strip().replace("\r", "")

    # Collapse multiline brackets (e.g. multiline figure captions)
    collapsed = ""
    lines = text.split("\n")
    open_bracket = False
    for line in lines:
        if "[" in line and not "]" in line:
            open_bracket = True
        if open_bracket: