-
Notifications
You must be signed in to change notification settings - Fork 0
/
presentation.py
221 lines (182 loc) · 5.95 KB
/
presentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python
import json
import latexslides
import os
import sys
sys.path.append('summarizer')
import summarize
# Entries carrying a 'table' key; filled by joinSections() and read by
# getPresentation().
tables = []
# Entries whose 'attr' is 'img'; filled by joinSections() and read by
# getPresentation().
images = []
def sanitize(text):
    """Flatten whitespace control characters in ``text``.

    Newlines and tabs are replaced by a single space each; carriage
    returns are removed.

    Args:
        text: The string to clean.

    Returns:
        A new string with the replacements applied.
    """
    # Strings are immutable: replace() returns a new string, so the calls
    # have to be chained and the final result returned.
    return text.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
def runParser(pathToPaper):
    """Execute the HTML parser and store its output in ``sections.json``.

    Runs the ``parser/parser`` executable (path relative to the current
    working directory) with ``pathToPaper`` as its only argument and
    redirects its standard output to ``sections.json``.

    Args:
        pathToPaper: Path of the paper handed to the parser.
    """
    import subprocess

    with open("sections.json", "w") as output:
        try:
            # The path is passed as its own argv entry and no shell is
            # involved, so spaces or shell metacharacters in it can neither
            # split the argument nor inject commands.
            subprocess.call(["parser/parser", pathToPaper], stdout=output)
        except OSError as err:
            # The parser could not be started. Stay best-effort, as the
            # previous os.system() call was, but report the reason.
            sys.stderr.write("Could not run parser/parser: %s\n" % err)
def getSections(filename, pathToPaper):
    """Parse the paper and load the resulting JSON sections.

    Args:
        filename: JSON file to read once the parser has run. Note that
            runParser() always writes to ``sections.json``, so this is
            expected to name that file.
        pathToPaper: Path of the paper, forwarded to runParser().

    Returns:
        The decoded JSON content of ``filename``.
    """
    import io

    runParser(pathToPaper)
    # Decode as ISO-8859-1 while reading rather than handing an encoding to
    # json.loads(): that form is only accepted positionally on Python 2.
    # The with-block also guarantees the file handle is closed.
    with io.open(filename, encoding="ISO-8859-1") as handle:
        return json.loads(handle.read())
def joinSections(raw_sections):
    """Join consecutive text entries that share a section ID.

    Entries are processed in order:

    * an entry with a 'table' key is appended to the module-level
      ``tables`` list and otherwise skipped;
    * an entry whose 'attr' is 'img' is appended to the module-level
      ``images`` list and otherwise skipped;
    * any other entry is treated as text. Its (sanitized) text is appended,
      separated by a space, to the section being built when the section ID
      is unchanged; otherwise the section being built is emitted and a new
      one is started with this entry's ID, title and text.

    Args:
        raw_sections: Iterable of dicts. Non-table entries must provide
            'section' and 'title'; 'text' is optional.

    Returns:
        A list of dicts with keys 'title', 'section' and 'text', one per run
        of text entries sharing an ID. Empty when no text entry was seen.
    """
    sections = []
    current = None  # Section being accumulated; None until the first text entry.
    for section in raw_sections:
        # Check for tables
        if 'table' in section:
            tables.append(section)
            continue
        ID = section['section']
        title = sanitize(section['title'])
        # Check for images
        if 'attr' in section and section['attr'] == 'img':
            images.append(section)
            continue
        text = sanitize(section['text']) if 'text' in section else ""
        if current is None or ID != current['section']:
            # A new section starts: emit the finished one, if any.
            if current is not None:
                sections.append(current)
            current = {'title': title, 'section': ID, 'text': text}
        elif text:
            # Same section: just append the text.
            current['text'] += " " + text
    # Emit the last section, but only if one was actually started, so that
    # empty (or table/image-only) input does not produce a blank section.
    if current is not None:
        sections.append(current)
    return sections
def genTitleSlide(title):
    """Build the title slide shown on the first page.

    ``title`` is used both as the slide title and as its block heading.
    Returns the resulting latexslides.TextSlide.
    """
    return latexslides.TextSlide(title=title, block_heading=title)
def genTextSlide(ID, title, text):
    """Generate an individual text (bullet) slide for one section.

    Args:
        ID: Section identifier; passed as the first positional argument to
            latexslides.BulletSlide.
        title: Used as the slide's block heading.
        text: Section text, summarized into bullets by
            summarize.summarize_page().

    Returns:
        A latexslides.BulletSlide, or None when ``text`` is empty or
        consists only of whitespace.
    """
    # str methods return new strings, so the emptiness check has to be made
    # on the stripped result rather than on the untouched original.
    if not text.strip():
        return None
    bullets = summarize.summarize_page(text)
    return latexslides.BulletSlide(ID, bullets, block_heading=title)
def convertToJPG(path):
    """Convert a GIF image to JPEG and return the path to use for it.

    Paths that do not end in ``.gif`` are returned unchanged. For a GIF the
    external ``convert`` command is run to write a sibling ``.jpg`` file,
    and that new path is returned.

    Args:
        path: Path of the image file.

    Returns:
        ``path`` itself, or the ``.jpg`` path for a ``.gif`` input.
    """
    import subprocess

    # Require a real '.gif' suffix; a bare name such as 'gif' is not a GIF
    # file name and must not be truncated.
    if not path.endswith('.gif'):
        return path
    newPath = path[:-len('.gif')] + '.jpg'
    try:
        # argv list, no shell: the path cannot be split on spaces or
        # interpreted as shell syntax.
        subprocess.call(['convert', path, newPath])
    except OSError as err:
        # The converter could not be started. Stay best-effort, as the
        # previous os.system() call was, but report the reason.
        sys.stderr.write("Could not run convert on %s: %s\n" % (path, err))
    return newPath
def genImgSlide(image):
    """Build a slide showing one image.

    ``image`` must provide 'title' and 'path'. The path is passed through
    convertToJPG() before being used as the slide's figure.
    """
    heading = image['title']
    figurePath = convertToJPG(image['path'])
    return latexslides.Slide(
        heading,
        figure=figurePath,
        figure_pos='w',
        figure_fraction_width=0.3,
        left_column_width=0.8,
    )
def extractTable(rawTable):
    """Reduce a raw table to a list of rows of plain cell strings.

    ``rawTable`` is a sequence of rows, each row a sequence of cells, each
    cell a sequence of dicts. A cell's string is the concatenation of the
    'text' values of its dicts that have a 'text' key and no 'attr' key.

    Returns the list of rows, or an empty list as soon as any cell yields
    no text at all.
    """
    rows = []
    for rawRow in rawTable:
        cells = []
        for rawCell in rawRow:
            cellText = "".join(
                part['text']
                for part in rawCell
                if 'text' in part and 'attr' not in part
            )
            if not cellText:
                # One empty cell invalidates the whole table.
                return []
            cells.append(cellText)
        rows.append(cells)
    return rows
def genTableSlide(table):
    """Build a slide for one table entry.

    ``table`` must provide 'table', 'section' and 'title'. The raw table is
    converted with extractTable(); when that yields an empty list no slide
    is built and None is returned.
    """
    rows = extractTable(table['table'])
    if rows == []:
        return None
    return latexslides.TableSlide(
        title=table['section'],
        table=rows,
        block_heading=table['title'],
    )
def genLatex(collection, filename):
    """Generate a LaTeX file from the given collection of slides.

    Args:
        collection: Slides handed to BeamerSlides.add_slides().
        filename: Output path handed to BeamerSlides.write().
    """
    author_and_inst = [("HTML2Presentation", "IIIT Hyderabad")]
    slides = latexslides.BeamerSlides(
        title="Slide Title",
        titlepage=False,
        toc_heading=None,
        author_and_inst=author_and_inst,
    )
    slides.add_slides(collection)
    # Dump to file
    slides.write(filename)
def genPDF(filename):
    """Generate the PDF of the TeX file whose name is supplied as argument.

    Runs ``pdflatex <filename> --shell-escape`` with its standard output and
    standard error discarded.

    Args:
        filename: Path of the TeX file to compile.
    """
    import subprocess

    with open(os.devnull, 'w') as devnull:
        try:
            # argv list, no shell: the file name cannot be split on spaces
            # or interpreted as shell syntax.
            proc = subprocess.Popen(
                ['pdflatex', filename, '--shell-escape'],
                stdin=subprocess.PIPE,
                stdout=devnull,
                stderr=devnull,
            )
            # Hack to continue when pdflatex halts: feed it a run of bare
            # newlines on stdin so every prompt is answered with Enter.
            proc.communicate(b'\n' * 100)
        except OSError as err:
            # pdflatex could not be started. Stay best-effort, as the
            # previous os.system() call was, but report the reason.
            sys.stderr.write("Could not run pdflatex: %s\n" % err)
def getPresentation():
    """Build the presentation for the paper named on the command line.

    Wrapper around the other functions: validates ``sys.argv``, parses the
    paper into sections, creates one slide per section followed by that
    section's image and table slides, then writes ``presentation.tex`` and
    compiles it.

    Returns a status message string in every case (including the two
    argument-validation failures).
    """
    if len(sys.argv) != 2:
        return "Proper arguments not provided. Aborting!!"
    paperPath = sys.argv[1]
    if not os.path.isfile(paperPath):
        return "Specified file does not exists. Aborting!!"

    slidesOut = []
    for entry in joinSections(getSections("sections.json", paperPath)):
        secID, heading, body = entry['section'], entry['title'], entry['text']

        # "Section: 0" gets the title slide; every other section a text slide.
        if secID == "Section: 0":
            mainSlide = genTitleSlide(heading)
        else:
            mainSlide = genTextSlide(secID, heading, body)
        if mainSlide is not None:
            slidesOut.append(mainSlide)

        # Slides for the images that belong to this section.
        slidesOut.extend(
            genImgSlide(img) for img in images if img['section'] == secID
        )

        # Slides for the tables that belong to this section.
        for tbl in tables:
            if tbl['section'] != secID:
                continue
            tableSlide = genTableSlide(tbl)
            if tableSlide is not None:
                slidesOut.append(tableSlide)

    texName = "presentation.tex"
    genLatex(slidesOut, texName)
    genPDF(texName)
    return "PDF successfully printed to `presentation.pdf`."
if __name__ == "__main__":
    # Call form of print: with a single argument the output is identical on
    # Python 2, and it is also valid syntax on Python 3.
    print(getPresentation())