-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
119 lines (107 loc) · 3.85 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
import subprocess as sp
from pathlib import Path
from docx import Document
from pptx import Presentation
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfpage import PDFPage
from utils import clean_line
def process_doc(file_path: Path):
    """Extract text from .doc files using the external ``catdoc`` tool.

    Args:
        file_path (Path): Path of the .doc file.

    Returns:
        list: Cleaned, non-empty sentences extracted from the file, or an
        empty list when ``catdoc`` is not installed.
    """
    try:
        p = sp.run(["catdoc", str(file_path)], capture_output=True)
    except FileNotFoundError as e:  # catdoc binary not on PATH
        print("Unable to process", file_path)
        print(e.strerror)
        return []
    # Legacy .doc files may emit non-UTF-8 bytes; don't crash on them.
    output = p.stdout.decode(errors="replace")
    # catdoc separates paragraphs with blank lines; drop entries that
    # clean down to nothing (consistent with process_docx filtering).
    cleaned = (clean_line(line) for line in output.split("\n\n") if line)
    return [line for line in cleaned if line]
def process_docx(file_path: Path):
    """Extract text from .docx files.

    Args:
        file_path (Path): Path of the .docx file.

    Returns:
        list: Cleaned, non-empty sentence fragments, split on ``"."``.
    """
    document = Document(file_path)
    sentences = []
    for paragraph in document.paragraphs:
        # Split each paragraph on periods, clean, and keep non-empty results.
        fragments = (clean_line(part) for part in paragraph.text.split("."))
        sentences.extend(fragment for fragment in fragments if fragment)
    return sentences
def process_pptx(file_path: Path):
    """Extract text from .pptx files.

    The first slide is skipped (presumably the title slide — confirm with
    the data this is run on).

    Args:
        file_path (Path): Path of the .pptx file.

    Returns:
        list: Non-empty paragraph texts from every text frame on the
        remaining slides.
    """
    presentation = Presentation(file_path)
    extracted = []
    for index, slide in enumerate(presentation.slides):
        if index == 0:
            # Skip the first slide entirely.
            continue
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text = "".join(run.text for run in paragraph.runs)
                if text:
                    extracted.append(text)
    return extracted
def process_pdf(file_path: Path):
    """Extract text from .pdf files via pdfminer layout analysis.

    Args:
        file_path (Path): Path of the .pdf file.

    Returns:
        list: Cleaned text of every ``LTTextBox``, in page order.
    """
    sentences = []
    with open(file_path, "rb") as f:
        resmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resmgr, laparams=laparams)
        # Ensure the aggregator device is released even if parsing fails
        # partway through a malformed PDF (the original leaked it).
        try:
            interpreter = PDFPageInterpreter(resmgr, device)
            for page in PDFPage.get_pages(f, caching=True, check_extractable=True):
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox):
                        sentences.append(clean_line(lt_obj.get_text()))
        finally:
            device.close()
    return sentences
def extract_sentences(file_path: Path):
    """Extract sentences from a file, caching the result as a JSON sibling.

    If ``<name>.json`` already exists next to the file, its cached
    ``"sentences"`` list is returned without re-parsing the document.

    Args:
        file_path (Path): Path of the .pptx/.doc/.docx/.pdf file.

    Returns:
        list: All sentences extracted from the file; empty for unsupported
        extensions or when extraction yields nothing.
    """
    json_file = file_path.with_suffix(".json")
    if json_file.exists():
        with open(json_file) as f:
            return json.load(f)["sentences"]
    # Dispatch on the extension; lower-cased so ".PDF", ".Docx" etc.
    # are handled too (the elif chain was case-sensitive).
    extractors = {
        ".pptx": process_pptx,
        ".doc": process_doc,
        ".docx": process_docx,
        ".pdf": process_pdf,
    }
    extractor = extractors.get(file_path.suffix.lower())
    sentences = extractor(file_path) if extractor else None
    if not sentences:
        return []
    data = {"name": str(file_path), "sentences": sentences}
    with open(json_file, "w") as f:
        json.dump(data, f, indent=4)  # cache extracted text for next call
    return sentences