def __init__(self):
    """Set up the container with placeholder grid bounds and no relatives.

    After the base ``LTTextContainer`` initialiser has run, the four
    row/column bounds are set to -1 (presumably an "unassigned" marker --
    confirm against the code that fills them in), ``children`` starts as
    an empty list and ``father`` as ``None``.
    """
    LTTextContainer.__init__(self)
    # Same assignment order as before: rows first, then columns.
    for bound in ('start_row', 'end_row', 'start_col', 'end_col'):
        setattr(self, bound, -1)
    self.children, self.father = [], None
Beispiel #2
0
    def callback(container: LTTextContainer):
        """Report whether the container's text includes every expected string.

        The container text is stripped of surrounding whitespace and has
        its newlines removed before the substring checks. ``strings`` is
        not defined in this function; it is looked up in the surrounding
        scope.

        Raises:
            TypeError: if ``container`` has no ``get_text`` attribute.
        """
        if hasattr(container, 'get_text'):
            flattened = container.get_text().strip().replace('\n', '')
            return all(needle in flattened for needle in strings)
        raise TypeError
Beispiel #3
0
def group(characters, delta=5, use_h_axis=True):
    """Cluster characters into containers by proximity.

    Each character joins the first existing group that lies closer than
    ``delta`` to it; when no group qualifies, a new ``LTTextContainer``
    is started for that character.

    Args:
        characters: iterable of items to cluster. Each one is handed to a
            group's ``vdistance``/``hdistance`` and ``add`` methods.
        delta: exclusive upper bound on the distance for a character to
            count as belonging to a group.
        use_h_axis: when truthy, both the vertical and the horizontal
            distance must be below ``delta``. When falsy only the vertical
            distance is checked, which groups text into lines.

    Returns:
        The list of groups, in the order they were created.
    """
    clusters = []
    for char in characters:
        target = None
        for candidate in clusters:
            # The vertical test always applies; the horizontal one is only
            # evaluated when requested and the vertical test already passed.
            close_enough = candidate.vdistance(char) < delta
            if close_enough and use_h_axis:
                close_enough = candidate.hdistance(char) < delta
            if close_enough:
                target = candidate
                break

        if target is None:
            # Nothing nearby: this character starts a group of its own.
            target = LTTextContainer()
            clusters.append(target)
        target.add(char)

    return clusters
Beispiel #4
0
def merge(lines, delta=5):
    """Fuse elements of each line that are horizontally close together.

    Text elements can land in separate groups when they sit on separate
    lines whose starting x-positions differ. This pass joins such
    elements back together.

    Args:
        lines: iterable of lines, each an iterable of elements exposing
            ``hdistance`` and ``extend``.
        delta: exclusive upper bound on the horizontal distance for two
            elements to be fused.

    Returns:
        A list with one new ``LTTextContainer`` per input line, holding
        the elements that survived the fusing. Absorbing elements are
        extended in place.
    """
    result = []
    for source_line in lines:
        parts = list(source_line)
        for i, j in itertools.combinations(range(len(parts)), 2):
            left, right = parts[i], parts[j]
            if (left is not None and right is not None
                    and left.hdistance(right) < delta):
                left.extend(right)
                # Mark the absorbed element so later pairs skip it.
                parts[j] = None

        combined = LTTextContainer()
        combined.extend([part for part in parts if part is not None])
        result.append(combined)

    return result
Beispiel #5
0
 def process_paragraph(self, paragraph: LTTextContainer, index: int,
                       page_containers: List[LTTextContainer]):
     """Handle a single paragraph; by default, add it to the result document.

     The paragraph's text, stripped of surrounding whitespace, is passed to
     ``self._result.add_paragraph`` together with ``index`` converted to a
     string. ``page_containers`` is not used by this implementation.
     """
     paragraph_text = paragraph.get_text().strip()
     paragraph_id = str(index)
     self._result.add_paragraph(paragraph_text, paragraph_id)
Beispiel #6
0
def check_text_is_date(paragraph: LTTextContainer) -> bool:
    """Tell whether the paragraph's text consists of nothing but a date.

    The stripped text must be one or two digits, a space, a run of word
    characters, a space and four digits, optionally followed by
    whitespace up to the end.
    """
    stripped = paragraph.get_text().strip()
    found = re.match(r'\d\d? \w+ \d\d\d\d\s*$', stripped)
    return found is not None
Beispiel #7
0
 def callback(element: LTTextContainer):
     """Report whether the element's text differs from the expected text.

     The element text is stripped of surrounding whitespace and has its
     newlines removed before the comparison. ``text`` is not defined in
     this function; it is looked up in the surrounding scope.

     Raises:
         TypeError: if ``element`` has no ``get_text`` attribute.
     """
     if not hasattr(element, 'get_text'):
         raise TypeError
     normalized = element.get_text().strip().replace('\n', '')
     return text != normalized
Beispiel #8
0
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTAnno
import os


def label_func():
    """Return the fixed label prefix used to pick out matching text."""
    label = '1. Определение'
    return label


# Open labels.txt for writing (mode 'w' truncates any existing content).
# NOTE(review): nothing in the visible code writes to or closes this handle --
# confirm it is used and closed further on, or switch to a `with` block.
labels_txt = open('labels.txt', 'w')

# Walk every page of text1.pdf and print the text of each child of a text
# container whose text starts with the label returned by label_func().
for page_layout in extract_pages("text1.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                # NOTE(review): this re-tests `element`, which already passed
                # the identical check above, so it is always true here;
                # possibly `text_line` was meant -- confirm.
                if isinstance(element, LTTextContainer):
                    # get_text is invoked through the class with text_line
                    # supplied as `self`; the same text is computed again
                    # for the print below.
                    if str(LTTextContainer.get_text(
                            self=text_line)).startswith(label_func()):
                        print(str(LTTextContainer.get_text(self=text_line)))