Beispiel #1
0
    def __init__(self, item):
        """Wrap ``item`` and snapshot its text, bounding box and font info.

        ``item`` must not already expose an ``attributes`` attribute. It is
        expected to provide ``get_text()``, ``yoffset``, ``x0``, ``x1``,
        ``y0``, ``height`` and ``width``, and to be iterable over its
        children.
        """
        assert not hasattr(item, 'attributes')
        self._item = item

        # Text normalisation: strip the unicode text, run it through
        # remove_ligatures, drop every character outside the printable ASCII
        # range, and finally encode the result as UTF-8.
        stripped = unicode(item.get_text()).strip()
        ascii_only = re.sub('[^\x20-\x7E]', '', remove_ligatures(stripped))
        self.text = ascii_only.encode('utf8')
        self.yoffset = item.yoffset

        # Bounding box; y1 is recomputed from y0 and the height because
        # item.y1 is often unreliable.
        self.x0, self.x1 = item.x0, item.x1
        self.y0 = item.y0
        self.y1 = item.y0 + item.height
        assert self.x0 <= self.x1 and self.y0 <= self.y1

        self.height, self.width = item.height, item.width

        self.style = {}
        self.attributes = {}

        # True when the text begins with "abstract" (case-insensitive).
        self.abstract = re.search('^abstract', self.text, flags=re.I) is not None

        # Fallback font info, used when no child reports a font name.
        self.fontsize = int(item.height)
        self.fontname = 'unknown'

        font_children = [child for child in item if hasattr(child, 'fontname')]
        self.children = font_children
        if font_children:
            # Prefer the most frequent child height as the font size: it may
            # be better because it is invariant to font type, though it can be
            # worse when pdfminer reports the character bbox incorrectly.
            size_votes = Counter(int(child.height) for child in font_children)
            name_votes = Counter(child.fontname for child in font_children)
            self.fontsize = size_votes.most_common()[0][0]
            self.fontname = name_votes.most_common()[0][0]
Beispiel #2
0
    def extract_plaintext(self):
        """Extract plaintext from the cached file and store it.

        If ``self.cached`` ends in ``.pdf`` the text comes from ``pdftotext``
        (written to ``data/pdftotext.txt`` under ``self.d``, reusing a cached
        result when available). Otherwise the file is read, coerced to
        unicode and passed through ``htmltotext``. In both cases ligatures are
        removed before the text is stored under ``data/text``.

        Returns the result of ``self.store`` (described by the original
        docstring as the text).
        """
        if not self.cached.endswith('.pdf'):
            # Non-pdf input: read, coerce to unicode, then clean up the html.
            text = htmltotext(force_unicode(robust_read(self.cached)))
        else:
            # pdf input: delegate the extraction to pdftotext.
            pdf_text_path = self.d / 'data' / 'pdftotext.txt'
            text = pdftotext(self.cached, output=pdf_text_path,
                             verbose=True, usecached=True)

        cleaned = remove_ligatures(text)
        return self.store('data/text', cleaned, overwrite=True)