Beispiel #1
0
def article_transform(article):
    """Bundle one article PDF together with a generated title page.

    The article is read from the ``pdf_papers`` folder located next to this
    file. Its page count and word count are added to the module-level
    totals, a title page is produced through ``first_page``, and the title
    page followed by every page of the article is written to
    ``<output_folder_dest>/Bundles/<name>_bundle.pdf``.

    Args:
        article: File name of the PDF inside ``pdf_papers``; it is passed
            through ``str()`` before use.

    Side effects:
        Adds to the globals ``all_words`` and ``all_pages`` and writes the
        title page PDF into the ``title pages`` folder. After a successful
        write, ``(bundle_path, 'rb')`` is appended to ``pdf_names`` and
        ``bundles_produced`` is incremented. If PyPDF2 raises
        ``PdfReadError`` while writing, ``CANT WRITE PDF`` is printed and
        neither ``pdf_names`` nor ``bundles_produced`` is changed.
    """
    # Only the names that are rebound below need a ``global`` declaration;
    # minutes_per_page, output_folder_dest and pdf_names are merely read or
    # mutated in place.
    global bundles_produced
    global all_pages
    global all_words

    current_directory = os.path.dirname(os.path.realpath(__file__))
    article_name = str(article)
    name_without_pdf = re.sub(r'\.pdf$', '', article_name)
    source_path = os.path.join(current_directory, 'pdf_papers', article_name)
    title_page_path = os.path.join('title pages',
                                   name_without_pdf + '_title_page.pdf')
    this_bundle = PyPDF2.PdfFileWriter()

    # NOTE(review): the source and title-page files are kept open until the
    # bundle has been written, on the assumption that pages added to the
    # writer still read from these streams at write time -- confirm against
    # the PyPDF2 documentation.
    with open(source_path, 'rb') as this_book:
        read_book = PyPDF2.PdfFileReader(this_book)

        # Fill in and render the title page.
        numPages = read_book.getNumPages()
        how_many_words = word_counter.count_words(read_book, numPages)
        all_words += how_many_words
        all_pages += numPages
        first_page.Title = 'Article: ' + name_without_pdf
        first_page.SubTitle = (' Week ' + str(folder_index) +
                               ' Reading Material')
        first_page.Name = (str(numPages) + ' Pages (' +
                           str(minutes_per_page * numPages) + ' Minutes)')
        first_page.go(title_page_path)

        with open(title_page_path, 'rb') as title_page_file:
            # The title page goes first. No bookmark is added here; only
            # the outline page mode is set on the bundle.
            read_title_page = PyPDF2.PdfFileReader(title_page_file)
            this_bundle.addPage(read_title_page.getPage(0))
            this_bundle.setPageMode("/UseOutlines")

            for page in range(numPages):
                this_bundle.addPage(read_book.getPage(page))

            bundle_path = os.path.join(output_folder_dest, 'Bundles',
                                       name_without_pdf + '_bundle.pdf')
            with open(bundle_path, 'wb') as output_stream:
                try:
                    this_bundle.write(output_stream)
                except PyPDF2.utils.PdfReadError:
                    print('CANT WRITE PDF')
                else:
                    # Record the bundle only when the write succeeded.
                    pdf_names.append((bundle_path, 'rb'))
                    bundles_produced += 1
def main(argv):
    """Count the words of a file and print the result.

    Args:
        argv: Command-line style argument list. ``argv[1]``, when present,
            is the path of the file to read; otherwise ``words.txt`` is
            used.
    """
    # Use the default file whenever no path argument is supplied. Testing
    # for "more than one element" also covers an empty argv, which must not
    # fall through to argv[1] and raise IndexError.
    file_name = argv[1] if len(argv) > 1 else "words.txt"
    # The file is read in binary mode and its raw content handed to
    # word_counter.count_words.
    with open(file_name, "rb") as fp:
        counted = word_counter.count_words(fp.read())
    word_counter.print_counted_words(counted)
    def test_nasty(self):
        text = """you haint no objections to sharing a harpooneer's blanket,
have ye? I s'pose you are goin' a-whalin',
so you'd better get used to that sort of thing."""
        assert count_words(text) == _____
 def test_biggest(self):
     """An entire book works.

     Reads the whole text file and checks that more than 200000 words are
     counted.
     """
     # The context manager closes the file as soon as it has been read,
     # instead of leaving the handle open.
     with open('____mobydick_full.txt') as book:
         text = book.read()
     assert count_words(text) > 200000
 def test_wrong_input(self):
     """Passing a non-string (the int 777) must raise the expected exception.

     The exception type is the ``_____`` blank, left to be filled in.
     """
     with pytest.raises(_____) as excinfo:
         not_a_string = 777
         count_words(not_a_string)
 def test_empty(self):
     """Check the word count of an empty string.

     The expected count is the ``_____`` blank, left to be filled in.
     """
     word_total = count_words('')
     assert word_total == _____
def test_words():
    """A passage full of apostrophes, hyphens and punctuation counts as 32 words."""
    passage = """you haint no objections to sharing a harpooneer's blanket,
have ye? I s'pose you are goin' a-whalin',so you'd better get used to that sort of thing."""
    expected_total = 32
    assert count_words(passage) == expected_total
def test_count_words_tabs():
    """Three words separated only by tab characters are counted as three."""
    assert count_words("the\twhite\twhale") == 3
def test_count_words():
    """A short three-word sentence yields a count of three."""
    sentence = "Call me Ishmael"
    assert count_words(sentence) == 3
Beispiel #10
0
import PDFtoTXTConvert
import word_counter
import os

def _remove_txt_files(directory):
    """Delete every entry in *directory* whose last dot-separated part is ``txt``.

    An empty *directory* string means the current working directory.
    """
    if directory == "":
        directory = os.getcwd()  # if no directory passed in
    for entry in os.listdir(directory):
        file_extension = entry.split(".")[-1]
        if file_extension == "txt":
            # os.path.join supplies the separator only when it is missing,
            # so this works with or without a trailing slash on any OS.
            os.remove(os.path.join(directory, entry))


# Containers handed to word_counter, which receives them as arguments.
d = {}
word_freq = []
tf_idf_freq = []

pdf_dir = "./articles/"
txt_dir = "./txt_formats/"

# NOTE(review): presumably converts the PDFs in pdf_dir into text files in
# txt_dir and then computes word frequencies / TF-IDF -- confirm against
# PDFtoTXTConvert and word_counter.
PDFtoTXTConvert.convert_multiple(pdf_dir, txt_dir)
word_counter.get_rid_of_stopword(txt_dir)
word_counter.count_words(word_freq, d)
word_counter.tf_idf_cal(d, txt_dir, tf_idf_freq)

# Clean up the .txt files in both working directories.
_remove_txt_files(txt_dir)

txt_dir = "./filtered_txt/"
_remove_txt_files(txt_dir)