コード例 #1
0
import slate, subprocess, pyttsx

# Read a PDF with slate, then either speak one page aloud (pyttsx) or write
# the text to audio files with espeak, a fixed number of pages per file.
loc = raw_input('Enter location of pdf:')
# PDFs are binary data, so open the file in binary mode.
with open(loc, 'rb') as f:
    doc = slate.PDF(f)

# int(raw_input(...)) instead of Python 2 input(): input() evaluates whatever
# the user types as a Python expression.
choice = int(raw_input("Speak(0)/Save(1):"))
if choice == 1:
    page = int(raw_input('Enter number of to save in 1 file(0 for all):'))
    if page <= 0:
        # 0 means "everything in one file"; max() keeps the range() step
        # below non-zero even when the document has no pages.
        page = max(len(doc), 1)
    # Walk the document in chunks of `page` pages using 0-based indices.
    for start in range(0, len(doc), page):
        end = min(start + page, len(doc))
        t = "".join(doc[k] for k in range(start, end))
        name = str(start) + " to " + str(end)
        # NOTE(review): espeak's -w option writes WAV data, so the ".mp3"
        # extension is misleading; kept so existing file names do not change.
        subprocess.call(["espeak", "-w" + name + ".mp3", t])
else:
    page = int(raw_input("Enter page number to speak:"))
    # Page numbers are 1-based here; reject 0 and out-of-range values instead
    # of letting a negative index silently pick a page from the end.
    if not 1 <= page <= len(doc):
        raise SystemExit("Page number out of range")
    engine = pyttsx.init()
    engine.say(doc[page - 1])
    engine.runAndWait()
コード例 #2
0
#!/usr/bin/env python2

import slate
import sys

# Number words used to build the input file names: page-one.pdf through
# page-twenty-seven.pdf, in order.
number_words = ' '.join([
    'one two three four five six seven eight nine ten',
    'eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen',
    'twenty twenty-one twenty-two twenty-three twenty-four twenty-five twenty-six twenty-seven',
])

chunks = []
for word in number_words.split():
    pdf_name = 'page-%s.pdf' % word
    with open(pdf_name) as handle:
        pages = slate.PDF(handle)
    # Keep the first page's text without its final character.
    chunks.append(pages[0][0:-1])

# The output path is the first command-line argument.
with open(sys.argv[1], 'w') as out:
    out.write(''.join(chunks))

# python 03-pages.py pages.txt   (the output path is read from sys.argv[1])
# base64 -d pages.txt > pages.png
# view the png "We like the moon" Joel and Alex Veitch rathergood.com
# Hmmm
# exiftool pages.png  -> shows some text in the comment
コード例 #3
0
ファイル: tests.py プロジェクト: matrach/oioioi
 def test_pdf_generation(self):
     """Generate a PDF from SAMPLE_TEXT and check its page count and text."""
     rendered = generator(source=SAMPLE_TEXT, header='header')
     pages = slate.PDF(StringIO(rendered))
     self.assertEqual(9, len(pages))
     # (page index, text expected somewhere on that page)
     expected_snippets = (
         (0, 'Lorem ipsum dolor'),
         (4, 'Sed egestas dui tellus'),
     )
     for page_index, snippet in expected_snippets:
         self.assertIn(snippet, pages[page_index])
コード例 #4
0
def _print_sentence_tokens(sentence):
    """Print every token of *sentence* on one line, then a blank line.

    Each token is written as ``<lowercased word>│<C>│<pos>│<ner>│-`` followed
    by a space, where ``<C>`` is ``U`` for an alphabetic token that is not
    entirely lowercase and ``L`` for everything else.
    """
    for token in sentence:
        word = str(token)
        if word.isalpha() and not word.islower():
            case_flag = '│U│'
        else:
            case_flag = '│L│'
        print(word.lower(),
              case_flag,
              token.pos,
              '│',
              token.ner,
              '│-',
              end=" ",
              sep='')
    print('\n')


def preprocess_pdf(pdf):
    """Print tagged tokens for the first two sentences after 'Abstract'.

    The PDF at path *pdf* is read with slate, whitespace is collapsed, and
    the text following the first 'Abstract' marker (at most 1000 characters)
    is run through StanfordCoreNLP. If the marker is absent, the start of the
    whole text is used instead. Nothing is returned; output goes to stdout.
    """
    #had to edit the next line because there was an error.
    #annotators = 'tokenize, ssplit, pos, lemma, ner, entitymentions, coref, sentiment, quote, openie'
    annotators = 'tokenize, ssplit, pos, lemma, ner'
    options = {'openie.resolve_coref': True}
    nlp = StanfordCoreNLP(annotators=annotators, options=options)

    with open(pdf, 'rb') as f:
        doc = slate.PDF(f)

    # Collapse every run of whitespace (including page breaks) to one space.
    doc = ' '.join([' '.join(x.split()) for x in doc])

    text_split = doc.split('Abstract')

    if len(text_split) > 1:
        # Drop everything before the first 'Abstract' marker.
        text_no_title = ' '.join(text_split[1:])
    else:
        # No marker found: fall back to the full text rather than failing
        # later on an unbound variable.
        text_no_title = doc
    # NOTE(review): on Python 3 this encode() yields bytes, which are then
    # passed to nlp(); kept as in the original -- confirm the wrapper accepts it.
    text_no_title = str(text_no_title).encode('latin-1', 'ignore')
    text_no_title = text_no_title[0:1000]

    document = nlp(text_no_title)
    #only formats the first 2 sentences in the abstract
    for sentence_index in range(2):
        try:
            sentence = document[sentence_index]
        except IndexError:
            # Fewer than two sentences were parsed; print what exists.
            break
        _print_sentence_tokens(sentence)