-
Notifications
You must be signed in to change notification settings - Fork 0
/
passage_scrapper.py
executable file
·168 lines (147 loc) · 5.59 KB
/
passage_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!./pdf_python/bin/python
# -*- coding: utf-8 -*-
import os
import re
import shutil
import string
import subprocess
import sys

from PyPDF2 import PdfFileMerger, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.colors import *
from reportlab.pdfbase import pdfmetrics, ttfonts
from reportlab.platypus import Paragraph, Spacer, Frame, SimpleDocTemplate, PageBreak
from reportlab.platypus import BaseDocTemplate, NextPageTemplate, PageTemplate
from reportlab.lib.styles import ParagraphStyle

from model import Book, Chapter, Verse, session
class MyPresentation:
    """PDF document that renders one passage as large text on a coloured page.

    Every page gets a solid background, a gradient band along the top edge
    and the passage reference written inside that band.
    """

    # Page geometry, typography and colours shared by all pages.
    HEIGHT = 768
    WIDTH = 1024
    FONTNAME = 'Arial'
    FONTSIZE = 56
    BCKGRDCOLOR = (0, 0.2, 0.1)
    FONTCOLOR = white
    # Replaced per instance in __init__ with the (book, chapter, verses)
    # sequence used for the page title.
    PASSAGE = []

    def __init__(self, fname, passage):
        """Register the font and prepare the document template.

        fname   -- path of the PDF file to be written
        passage -- indexable of at least three items; items 0, 1 and 2
                   are printed in the title as "<0>.<1>:<2>"
        """
        arial = ttfonts.TTFont('Arial', 'arial.ttf')
        pdfmetrics.registerFont(arial)
        page_size = (self.WIDTH, self.HEIGHT)
        self.doc = SimpleDocTemplate(
            fname,
            pagesize=page_size,
            leftMargin=40,
            rightMargin=40,
            topMargin=100,
            bottomMargin=30,
            allowSplitting=False
        )
        self.story = []
        self.PASSAGE = passage

    def pageCanvas(self, canvas, doc):
        """Draw the page decoration: background, top band and title.

        Passed to ``doc.build`` as both the first-page and later-pages
        callback, so it runs for every page.
        """
        band_height = 80
        band_bottom = self.HEIGHT - band_height

        # 1. Solid background covering the whole page.
        red = self.BCKGRDCOLOR[0]
        green = self.BCKGRDCOLOR[1]
        blue_part = self.BCKGRDCOLOR[2]
        canvas.saveState()
        canvas.setFont(self.FONTNAME, self.FONTSIZE)
        canvas.setFillColorRGB(red, green, blue_part)
        canvas.rect(0, 0, self.WIDTH, self.HEIGHT, stroke=0, fill=1)
        canvas.restoreState()

        # 2. Horizontal black-to-blue gradient, clipped to the top band.
        canvas.saveState()
        band = canvas.beginPath()
        band.rect(0, band_bottom, self.WIDTH, band_height)
        canvas.clipPath(band, stroke=0)
        canvas.linearGradient(
            0, band_bottom, self.WIDTH, band_bottom,
            (black, blue), extend=False)
        canvas.restoreState()

        # 3. Passage reference in white text inside the band.
        title = "%s.%s:%s" % (self.PASSAGE[0],
                              self.PASSAGE[1], self.PASSAGE[2])
        canvas.saveState()
        canvas.setFillColorRGB(1, 1, 1)
        canvas.setFont(self.FONTNAME, self.FONTSIZE)
        canvas.drawString(40, 714, title)
        canvas.restoreState()

    def runBuild(self, str):
        """Add one paragraph per item of ``str`` and write the PDF.

        str -- iterable of text lines (the name shadows the builtin but is
               kept because it is part of the method's signature)
        """
        verse_style = ParagraphStyle('test')
        verse_style.fontName = self.FONTNAME
        verse_style.fontSize = self.FONTSIZE
        verse_style.leading = 12 * self.FONTSIZE / 10
        verse_style.textColor = self.FONTCOLOR
        verse_style.borderWidth = 0

        self.story.extend(
            Paragraph("%s" % line, verse_style) for line in str)
        self.doc.build(self.story, onFirstPage=self.pageCanvas,
                       onLaterPages=self.pageCanvas)
class PassageScrapper:
    """Collect passage references from a text and render them into one PDF.

    ``file_to_scrap`` obtains a TXT file (converting the input with
    LibreOffice when needed), ``find_the_passages`` extracts references
    written like ``(Jn.3:16-18)``, every reference is rendered into its own
    PDF inside ``_tmp`` and the pieces are merged into ``presentation.pdf``.
    """

    # "(<book> <chapter>:<verses>)" -- the book is an optional digit followed
    # by Latin/Cyrillic letters; verses are digits, commas and dashes.
    # Kept as a unicode literal so the Cyrillic ranges work on Python 2.
    _PASSAGE_RE = re.compile(
        u'\\(([0-9]?[а-яА-Яa-zA-Z]+)[ .]{1,2}(\\d+):([\\d,\\-]+)\\)')
    # "first-last" verse range.
    _VERSE_RANGE_RE = re.compile(r'(\d+)-(\d+)')
    # Stand-alone verse number, optionally surrounded by commas.
    _SINGLE_VERSE_RE = re.compile(r'[,]*(\d+)[,]*')

    def file_to_scrap(self):
        """
        Get the input file name from the single command-line argument or,
        failing that, by prompting the user.  If no matching TXT file exists
        in the current directory, ask LibreOffice to convert the input, then
        hand the TXT file to ``find_the_passages``.
        """
        if len(sys.argv) == 2:
            filename = sys.argv[1]
        else:
            filename = raw_input('Enter filename: ')  # Python 2 builtin
        # splitext leaves names without an extension intact; slicing at
        # rfind('.') chopped the last character when there was no dot.
        filename_txt = os.path.splitext(filename)[0] + '.txt'
        txt_path = os.path.join(os.getcwd(), filename_txt)
        if not os.path.exists(txt_path):
            try:
                # Argument list instead of a shell string: file names with
                # quotes or shell metacharacters are passed through safely.
                subprocess.call(['libreoffice', '--headless',
                                 '--convert-to', 'txt:Text', filename])
            except OSError:
                # LibreOffice could not be started; the existence check
                # below reports the failure to the user.
                pass
        if os.path.exists(txt_path):
            self.find_the_passages(filename_txt)
        else:
            print("something went wrong. there is no TXT-file")

    def find_the_passages(self, fname):
        """
        Read ``fname`` as UTF-8 (undecodable bytes are replaced), find every
        passage reference, render each one to ``_tmp/<index>.pdf`` and merge
        the results.
        """
        with open(fname, 'rb') as source:
            ftext = source.read().decode('utf-8', 'replace')
        passages = self._PASSAGE_RE.findall(ftext)
        tmp_list = []
        # enumerate gives every occurrence its own file name and avoids the
        # quadratic list.index() lookup.
        for index, passage in enumerate(passages):
            tmp_list.append("%d.pdf" % index)
            self.create_pdf(passage, tmp_list[-1])
        self.merge_PDF(tmp_list)

    def get_verses_list(self, vrange):
        """
        Expand a verse specification such as ``"1-3,7"`` into a sorted list
        of verse numbers (``[1, 2, 3, 7]``).  Ranges are inclusive; an empty
        specification yields an empty list.
        """
        vlist = []
        if '-' in vrange:
            for first, last in self._VERSE_RANGE_RE.findall(vrange):
                vlist.extend(range(int(first), int(last) + 1))
            # Drop the ranges so their endpoints are not counted again as
            # single verses below.
            vrange = self._VERSE_RANGE_RE.sub('', vrange)
        if vrange:
            vlist.extend(
                int(item) for item in self._SINGLE_VERSE_RE.findall(vrange))
        return sorted(vlist)

    def get_the_passage(self, passage):
        """
        Query the database for the verses of ``passage`` -- a
        (book, chapter, verses) triple -- and return their texts as a list
        of unicode strings.  The book is matched with a LIKE on
        ``Book.shortname``.
        """
        verses_list = self.get_verses_list(passage[2])
        book = "%" + passage[0] + "%"
        verses = session.query(Verse).join(Chapter).join(Book).filter(
            Book.shortname.like(book)).filter(
            Chapter.number == int(passage[1])).filter(
            Verse.number.in_(verses_list)).all()
        return [unicode(v.text) for v in verses]

    def create_pdf(self, passage, tmp_file):
        """
        Render ``passage`` into ``_tmp/<tmp_file>`` under the current
        directory, creating ``_tmp`` if it does not exist yet.
        """
        working_dir = os.path.join(os.getcwd(), '_tmp')
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        tmp_PDF = MyPresentation(os.path.join(working_dir, tmp_file),
                                 passage)
        text = self.get_the_passage(passage)
        tmp_PDF.runBuild(text)

    def merge_PDF(self, tmp_list):
        """
        Concatenate the PDFs named in ``tmp_list`` (looked up in ``_tmp``)
        into ``presentation.pdf`` and remove the ``_tmp`` directory.
        """
        working_dir = os.path.join(os.getcwd(), '_tmp')
        merger = PdfFileMerger()
        handles = []
        try:
            for filename in tmp_list:
                # The inputs stay open until write() has run; they are
                # closed explicitly afterwards instead of being leaked.
                handle = open(os.path.join(working_dir, filename), 'rb')
                handles.append(handle)
                merger.append(PdfFileReader(handle))
            merger.write("presentation.pdf")
        finally:
            for handle in handles:
                handle.close()
        # Same effect as "rm -rf _tmp" without spawning a shell.
        shutil.rmtree(working_dir, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: locate the input text and build the presentation.
    scrapper = PassageScrapper()
    scrapper.file_to_scrap()