def test_embedding_dimensions(self, setup_embedder):
    """Each batch of lines must embed to a rank-3 tensor.

    The expected layout is (batch, tokens, embedding_dim) —
    TODO confirm against the embedder implementation.
    """
    embedder, data_manager = setup_embedder
    lines, labels = data_manager.train_dataset.get_lines_labels()
    for batch in chunks(lines, 10):
        assert embedder(batch).dim() == 3
def test_returns_float_tensors(self, setup_embedder):
    """Every batch embedding is a float32 torch.Tensor.

    Checks ``embedding.dtype`` rather than
    ``isinstance(embedding, torch.FloatTensor)``: the legacy type-class
    check is device-specific and fails for float tensors that live on
    CUDA (``torch.cuda.FloatTensor``), so the original assertion would
    spuriously fail on a GPU run.
    """
    embedder, data_manager = setup_embedder
    train_dataset = data_manager.train_dataset
    lines, labels = train_dataset.get_lines_labels()
    for lines_batch in chunks(lines, 10):
        embedding = embedder(lines_batch)
        assert isinstance(embedding, torch.Tensor)
        assert embedding.dtype == torch.float32
def on_post(self, req, resp) -> Dict[str, Any]:
    """ Post the base64 url encoded pdf file to sect label.

    This converts the base64 encoded string to pdf. Reads the pdf
    line by line and returns the logical section for every line.

    Returns
    -------
    Dict[str, Any]
        Return the lines with corresponding labels to the client
        ``{"labels": [], "lines": []}`` (set on ``resp.media``).
    """
    # Lazily build the inference client on first request.
    if self.infer_client is None:
        self.infer_client = self.model_infer_func(self.model_filepath)

    file = req.get_param("file", None)
    if file is None:
        resp.status = falcon.HTTP_400
        resp.body = "File not found in your request."
        return

    file_contents = file.file.read()  # binary string
    filename = file.filename
    pdf_save_location = self.pdf_store.save_pdf_binary_string(
        pdf_string=file_contents, out_filename=filename
    )

    # run pdfbox to get the lines from the file
    # running the jar file .. need to find a better solution
    completed = subprocess.run(
        [
            "java",
            "-jar",
            self.pdfbox_jar_path,
            "ExtractText",
            "-console",
            pdf_save_location,
        ],
        stdout=subprocess.PIPE,
    )
    # Decode the captured stdout. The original code did
    # ``str(completed.stdout)``, which yields the repr "b'...'" with
    # literal escape sequences — that corrupts the first and last lines
    # and forces splitting on the two-character "\\n". Decoding gives
    # the real text, split on genuine newlines.
    text = completed.stdout.decode("utf-8", errors="replace")
    lines = text.split("\n")

    labels = []
    for batch_lines in chunks(lines, 64):
        label = self.infer_client.infer_batch(lines=batch_lines)
        labels.append(label)
    labels = list(itertools.chain.from_iterable(labels))

    resp.media = {"labels": labels, "lines": lines}
    resp.status = falcon.HTTP_201
def process_pdf(file: UploadFile = File(None)):
    """ Classifies every line in the document to the logical section of
    the document. The logical section can be title, author, email,
    section header, subsection header etc

    Parameters
    ----------
    file : File
        The Bytestream of a file to be uploaded

    Returns
    -------
    JSON
        ``{"labels": [(line, label)]}``
    """
    global sectlabel_model
    if sectlabel_model is None:
        sectlabel_model = SectLabel()

    # Persist the uploaded bytes so pdfbox can read them from disk.
    pdf_save_location = pdf_store.save_pdf_binary_string(
        pdf_string=file.file.read(), out_filename=file.filename
    )
    # noinspection PyTypeChecker
    pdf_reader = PdfReader(filepath=pdf_save_location)
    lines = pdf_reader.read_pdf()

    # Classify in batches of 64 lines, flattening as we go.
    all_lines = []
    all_labels = []
    for batch in chunks(lines, 64):
        all_labels.extend(sectlabel_model.predict_for_text_batch(texts=batch))
        all_lines.extend(batch)

    response_tuples = list(zip(all_lines, all_labels))

    # remove the saved pdf
    pdf_store.delete_file(str(pdf_save_location))
    return {"labels": response_tuples}
def process_pdf(file: UploadFile = File(None)):
    """Label every line of an uploaded PDF with its logical section.

    The uploaded byte stream is saved to disk, read line by line, and
    every line is classified by the (lazily instantiated) SectLabel
    model. Returns ``{"labels": [(line, label), ...]}``.
    """
    global sectlabel_model
    if sectlabel_model is None:
        sectlabel_model = SectLabel()

    uploaded = file.file
    pdf_save_location = pdf_store.save_pdf_binary_string(
        pdf_string=uploaded.read(), out_filename=file.filename
    )
    # noinspection PyTypeChecker
    pdf_reader = PdfReader(filepath=pdf_save_location)
    lines = pdf_reader.read_pdf()

    batched_labels = []
    batched_lines = []
    for batch_lines in chunks(lines, 64):
        batched_labels.append(
            sectlabel_model.predict_for_text_batch(texts=batch_lines)
        )
        batched_lines.append(batch_lines)

    flat_lines = list(itertools.chain.from_iterable(batched_lines))
    flat_labels = list(itertools.chain.from_iterable(batched_labels))
    response_tuples = [
        (line, label) for line, label in zip(flat_lines, flat_labels)
    ]

    # Clean up the temporary file on disk.
    pdf_store.delete_file(str(pdf_save_location))
    return {"labels": response_tuples}
def predict_for_pdf(self, pdf_filename: pathlib.Path) -> (List[str], List[str]):
    """ Predicts lines and labels given a pdf filename

    Parameters
    ----------
    pdf_filename : pathlib.Path
        The location where pdf files are stored

    Returns
    -------
    List[str], List[str]
        The lines and labels inferred on the file. Both lists are
        empty when no lines could be read from the pdf.
    """
    pdf_reader = PdfReader(filepath=pdf_filename)
    lines = pdf_reader.read_pdf()
    lines = self._preprocess(lines)

    if not lines:
        self.logger.warning(f"No lines were read from file {pdf_filename}")
        # Return an empty pair instead of "" so that callers unpacking
        # ``lines, labels = predict_for_pdf(...)`` do not raise
        # ValueError, matching the declared return type.
        return [], []

    all_labels = []
    all_lines = []
    for batch_lines in chunks(lines, 64):
        labels = self.infer.infer_batch(lines=batch_lines)
        all_labels.append(labels)
        all_lines.append(batch_lines)

    all_lines = list(itertools.chain.from_iterable(all_lines))
    all_labels = list(itertools.chain.from_iterable(all_labels))
    return all_lines, all_labels
def extract_abstract(
    self, pdf_filename: pathlib.Path, dehyphenate: bool = True
) -> str:
    """ Extracts abstracts from a pdf using sectlabel. This is the
    python programmatic version of the API. The APIs can be found in
    sciwing/api. You can see that for more information

    Parameters
    ----------
    pdf_filename : pathlib.Path
        The path where the pdf is stored
    dehyphenate : bool
        Scientific documents are two columns sometimes and there are a
        lot of hyphenation introduced. If this is true, we remove the
        hyphens from the code

    Returns
    -------
    str
        The abstract of the pdf
    """
    pdf_reader = PdfReader(filepath=pdf_filename)
    lines = pdf_reader.read_pdf()

    all_labels = []
    all_lines = []
    for batch_lines in chunks(lines, 64):
        labels = self.infer.infer_batch(lines=batch_lines)
        all_labels.append(labels)
        all_lines.append(batch_lines)

    all_lines = list(itertools.chain.from_iterable(all_lines))
    all_labels = list(itertools.chain.from_iterable(all_labels))

    response_tuples = []
    for line, label in zip(all_lines, all_labels):
        response_tuples.append((line, label))

    # Collect the lines between the "abstract" section header and the
    # next section header.
    abstract_lines = []
    found_abstract = False
    for line, label in response_tuples:
        if label == "sectionHeader" and line.strip().lower() == "abstract":
            found_abstract = True
            continue
        if found_abstract and label == "sectionHeader":
            break
        if found_abstract:
            abstract_lines.append(line.strip())

    if dehyphenate:
        buffer_lines = []  # holds lines that should be a single line
        final_lines = []
        for line in abstract_lines:
            if line.endswith("-"):
                # Drop only the trailing hyphen introduced by the line
                # break. The previous ``line.replace("-", "")`` also
                # deleted legitimate in-word hyphens elsewhere in the
                # line (e.g. "state-of-the-art").
                buffer_lines.append(line[:-1])
            else:
                # if the hyphenation ended on the previous
                # line then the next line also needs to be
                # added to the buffer line
                if len(buffer_lines) > 0:
                    buffer_lines.append(line)

                    line_ = "".join(buffer_lines)

                    # add the line from buffer first
                    final_lines.append(line_)

                else:
                    # add the current line
                    final_lines.append(line)

                buffer_lines = []

        abstract_lines = final_lines

    abstract = " ".join(abstract_lines)
    return abstract
def extract_pdf(file: UploadFile = File(None)):
    """ Extracts the abstract from a scholarly article

    Parameters
    ----------
    file : UploadFile
        Byte Stream of a file uploaded.

    Returns
    -------
    JSON
        ``{"abstract": The abstract found in the scholarly document}``
    """
    global sectlabel_model
    if sectlabel_model is None:
        sectlabel_model = SectLabel()

    file_handle = file.file
    file_name = file.filename
    file_contents = file_handle.read()

    pdf_save_location = pdf_store.save_pdf_binary_string(
        pdf_string=file_contents, out_filename=file_name
    )
    # (removed stray debug print of the save location — it leaked to
    # server stdout on every request)

    # noinspection PyTypeChecker
    pdf_reader = PdfReader(filepath=pdf_save_location)

    # read pdf lines
    lines = pdf_reader.read_pdf()

    all_labels = []
    all_lines = []
    for batch_lines in chunks(lines, 64):
        labels = sectlabel_model.predict_for_text_batch(texts=batch_lines)
        all_labels.append(labels)
        all_lines.append(batch_lines)

    all_lines = list(itertools.chain.from_iterable(all_lines))
    all_labels = list(itertools.chain.from_iterable(all_labels))

    response_tuples = []
    for line, label in zip(all_lines, all_labels):
        response_tuples.append((line, label))

    # Keep only the lines between the "abstract" section header and the
    # following section header.
    abstract_lines = []
    found_abstract = False
    for line, label in response_tuples:
        if label == "sectionHeader" and line.strip().lower() == "abstract":
            found_abstract = True
            continue
        if found_abstract and label == "sectionHeader":
            break
        if found_abstract:
            abstract_lines.append(line.strip())

    abstract = " ".join(abstract_lines)

    # remove the saved pdf
    pdf_store.delete_file(str(pdf_save_location))

    return {"abstract": abstract}