def test_embedding_dimensions(self, setup_embedder):
    """Every embedded batch of training lines must be a rank-3 tensor."""
    embedder, data_manager = setup_embedder
    all_lines, _ = data_manager.train_dataset.get_lines_labels()
    for batch in chunks(all_lines, 10):
        assert embedder(batch).dim() == 3
 def test_returns_float_tensors(self, setup_embedder):
     embedder, data_manager = setup_embedder
     train_dataset = data_manager.train_dataset
     lines, labels = train_dataset.get_lines_labels()
     for lines_batch in chunks(lines, 10):
         embedding = embedder(lines_batch)
         assert isinstance(embedding, torch.FloatTensor)
Beispiel #3
0
    def on_post(self, req, resp) -> Dict[str, Any]:
        """ Post the base64 url encoded pdf file to sect label.
        This converts the base64 encoded string to pdf. Reads the
        pdf line by line and returns the logical section for every line.

        Responds with status 201 and media ``{"labels": [], "lines": []}``
        on success, or status 400 with an error body when no ``file``
        parameter was sent.

        Returns
        -------
        Dict[str, Any]
            Return the lines with corresponding labels to the client
            ``{"labels": [], "lines": []}``
        """
        # Lazily build the inference client so the first request pays the cost.
        if self.infer_client is None:
            self.infer_client = self.model_infer_func(self.model_filepath)

        file = req.get_param("file", None)
        if file is None:
            # BUG FIX: return early — previously an unconditional
            # ``resp.status = falcon.HTTP_201`` at the end of the method
            # overwrote this 400 status. Also dropped the pointless f-prefix.
            resp.status = falcon.HTTP_400
            resp.body = "File not found in your request."
            return

        file_contents = file.file.read()  # binary string
        filename = file.filename
        pdf_save_location = self.pdf_store.save_pdf_binary_string(
            pdf_string=file_contents, out_filename=filename
        )

        # run pdfbox to get the lines from the file
        # running the jar file .. need to find a better solution
        completed = subprocess.run(
            [
                "java",
                "-jar",
                self.pdfbox_jar_path,
                "ExtractText",
                "-console",
                pdf_save_location,
            ],
            stdout=subprocess.PIPE,
        )
        # str(bytes) yields a repr like "b'...'" whose line breaks are the
        # two characters backslash + n, hence the literal "\\n" split below.
        text = str(completed.stdout)
        lines = text.split("\\n")

        labels = []
        for batch_lines in chunks(lines, 64):
            # extend keeps the list flat — no append + chain.from_iterable pass
            labels.extend(self.infer_client.infer_batch(lines=batch_lines))

        resp.media = {"labels": labels, "lines": lines}
        resp.status = falcon.HTTP_201
Beispiel #4
0
def process_pdf(file: UploadFile = File(None)):
    """ Classifies every line in the document to the logical section of the document. The logical
    section can be title, author, email, section header, subsection header etc

    Parameters
    ----------
    file : File
        The Bytestream of a file to be uploaded

    Returns
    -------
    JSON
        ``{"labels": [(line, label)]}``

    """
    global sectlabel_model
    # Lazily instantiate the model so only the first request pays the load cost.
    if sectlabel_model is None:
        sectlabel_model = SectLabel()

    file_contents = file.file.read()

    pdf_save_location = pdf_store.save_pdf_binary_string(
        pdf_string=file_contents, out_filename=file.filename
    )

    # noinspection PyTypeChecker
    pdf_reader = PdfReader(filepath=pdf_save_location)

    # read pdf lines and classify them in batches of 64
    lines = pdf_reader.read_pdf()
    all_lines = []
    all_labels = []
    for batch_lines in chunks(lines, 64):
        # extend keeps both lists flat — no append + chain.from_iterable pass
        all_labels.extend(sectlabel_model.predict_for_text_batch(texts=batch_lines))
        all_lines.extend(batch_lines)

    # pair every line with its predicted label
    response_tuples = list(zip(all_lines, all_labels))

    # remove the saved pdf
    pdf_store.delete_file(str(pdf_save_location))

    return {"labels": response_tuples}
Beispiel #5
0
def process_pdf(file: UploadFile = File(None)):
    """ Classify every line of an uploaded PDF into its logical section.

    Parameters
    ----------
    file : UploadFile
        The byte stream of the uploaded file.

    Returns
    -------
    JSON
        ``{"labels": [(line, label)]}``
    """
    global sectlabel_model
    # Lazily instantiate the model so only the first request pays the load cost.
    if sectlabel_model is None:
        sectlabel_model = SectLabel()

    file_contents = file.file.read()

    pdf_save_location = pdf_store.save_pdf_binary_string(
        pdf_string=file_contents, out_filename=file.filename
    )

    # noinspection PyTypeChecker
    pdf_reader = PdfReader(filepath=pdf_save_location)

    # read pdf lines and classify them in batches of 64
    lines = pdf_reader.read_pdf()
    all_lines = []
    all_labels = []
    for batch_lines in chunks(lines, 64):
        # extend keeps both lists flat — no append + chain.from_iterable pass
        all_labels.extend(sectlabel_model.predict_for_text_batch(texts=batch_lines))
        all_lines.extend(batch_lines)

    response_tuples = list(zip(all_lines, all_labels))

    # remove the saved pdf
    pdf_store.delete_file(str(pdf_save_location))

    return {"labels": response_tuples}
Beispiel #6
0
    def predict_for_pdf(self,
                        pdf_filename: pathlib.Path) -> (List[str], List[str]):
        """ Predicts lines and labels given a pdf filename

        Parameters
        ----------
        pdf_filename : pathlib.Path
            The location where pdf files are stored

        Returns
        -------
        List[str], List[str]
            The lines and labels inferred on the file. Both lists are
            empty when no text could be read from the pdf.
        """
        pdf_reader = PdfReader(filepath=pdf_filename)
        lines = pdf_reader.read_pdf()

        lines = self._preprocess(lines)

        if not lines:
            self.logger.warning(f"No lines were read from file {pdf_filename}")
            # BUG FIX: previously returned "" here, which broke callers
            # unpacking the documented (lines, labels) pair.
            return [], []

        all_lines = []
        all_labels = []

        for batch_lines in chunks(lines, 64):
            # extend keeps both lists flat — no append + chain.from_iterable pass
            all_labels.extend(self.infer.infer_batch(lines=batch_lines))
            all_lines.extend(batch_lines)

        return all_lines, all_labels
Beispiel #7
0
    def extract_abstract(
        self, pdf_filename: pathlib.Path, dehyphenate: bool = True
    ) -> str:
        """ Extracts abstracts from a pdf using sectlabel. This is the python programmatic version of
        the API. The APIs can be found in sciwing/api. You can see that for more information

        Parameters
        ----------
        pdf_filename : pathlib.Path
            The path where the pdf is stored
        dehyphenate : bool
            Scientific documents are two columns sometimes and there are a lot of hyphenation
            introduced. If this is true, we remove the hyphens from the code

        Returns
        -------
        str
            The abstract of the pdf

        """
        pdf_reader = PdfReader(filepath=pdf_filename)
        lines = pdf_reader.read_pdf()

        all_lines = []
        all_labels = []
        for batch_lines in chunks(lines, 64):
            # extend keeps both lists flat — no append + chain.from_iterable pass
            all_labels.extend(self.infer.infer_batch(lines=batch_lines))
            all_lines.extend(batch_lines)

        # Collect the lines between the "abstract" section header and the
        # next section header.
        abstract_lines = []
        found_abstract = False
        for line, label in zip(all_lines, all_labels):
            if label == "sectionHeader" and line.strip().lower() == "abstract":
                found_abstract = True
                continue
            if found_abstract and label == "sectionHeader":
                break
            if found_abstract:
                abstract_lines.append(line.strip())

        if dehyphenate:
            buffer_lines = []  # holds lines that should be a single line
            final_lines = []
            for line in abstract_lines:
                if line.endswith("-"):
                    # BUG FIX: drop only the trailing hyphen; the old
                    # line.replace("-", "") removed every hyphen in the
                    # line, corrupting legitimately hyphenated words.
                    buffer_lines.append(line[:-1])
                else:
                    # if the hyphenation ended on the previous line then
                    # the current line also belongs to the buffered word
                    if buffer_lines:
                        buffer_lines.append(line)
                        final_lines.append("".join(buffer_lines))
                    else:
                        final_lines.append(line)
                    buffer_lines = []

            # BUG FIX: flush a trailing hyphenated fragment so the last
            # line of the abstract is not silently dropped.
            if buffer_lines:
                final_lines.append("".join(buffer_lines))

            abstract_lines = final_lines

        abstract = " ".join(abstract_lines)
        return abstract
Beispiel #8
0
def extract_pdf(file: UploadFile = File(None)):
    """ Extracts the abstract from a scholarly article

    Parameters
    ----------
    file : uploadFile
        Byte Stream of a file uploaded.

    Returns
    -------
    JSON
        ``{"abstract": The abstract found in the scholarly document}``

    """

    global sectlabel_model
    # Lazily instantiate the model so only the first request pays the load cost.
    if sectlabel_model is None:
        sectlabel_model = SectLabel()

    file_contents = file.file.read()

    pdf_save_location = pdf_store.save_pdf_binary_string(
        pdf_string=file_contents, out_filename=file.filename
    )

    # noinspection PyTypeChecker
    pdf_reader = PdfReader(filepath=pdf_save_location)

    # read pdf lines and classify them in batches of 64
    lines = pdf_reader.read_pdf()
    all_lines = []
    all_labels = []
    for batch_lines in chunks(lines, 64):
        # extend keeps both lists flat — no append + chain.from_iterable pass
        all_labels.extend(sectlabel_model.predict_for_text_batch(texts=batch_lines))
        all_lines.extend(batch_lines)

    # Collect the lines between the "abstract" section header and the
    # next section header.
    abstract_lines = []
    found_abstract = False
    for line, label in zip(all_lines, all_labels):
        if label == "sectionHeader" and line.strip().lower() == "abstract":
            found_abstract = True
            continue
        if found_abstract and label == "sectionHeader":
            break
        if found_abstract:
            abstract_lines.append(line.strip())

    abstract = " ".join(abstract_lines)

    # remove the saved pdf
    pdf_store.delete_file(str(pdf_save_location))

    return {"abstract": abstract}