Beispiel #1
0
    def _mergeInto(self, key, value):
        """Merge one or more values into the list stored at ``self.data[key]``.

        ``value`` may be nested; it is flattened recursively. Typical
        inputs and the values that end up stored:

          * {"A": ["B", "C"]}  =>  B, C
          * [{"A": "B"}]       =>  B
          * {'x-default': 'B'} =>  B
          * ["B", "C"]         =>  B, C
          * "B"                =>  B

        Dictionary keys are discarded; only the values are kept. Falsy
        values (``None``, ``""``) and strings that are empty after
        stripping are ignored, and a value that is already present under
        ``key`` is not added a second time.
        """
        if isinstance(value, dict):
            # Only the values matter. ``values()`` exists on both Python 2
            # and Python 3 dictionaries, unlike ``iteritems()``.
            value = [make_unicode(item) for item in value.values()]
        if isinstance(value, list):
            for item in value:
                self._mergeInto(key, item)
            return

        if not value:  # None or an empty string: nothing to store
            return
        # Anything left at this point is expected to be a string.
        text = make_unicode(value.strip())
        if not text:
            return
        # Create the list lazily so that ignored values never add a key.
        known = self.data.setdefault(key, [])
        if text not in known:
            known.append(text)
Beispiel #2
0
 def parse_rules(self, tuple_, header):
     """Evaluate a (possibly nested) document rule against a header.

     ``tuple_`` is a ``(key, value)`` pair; see settings.py for the
     syntax. The key is matched case-insensitively:

       * ``AND`` / ``OR``: the value is an iterable of sub-rules that are
         evaluated in order, stopping at the first one that decides the
         outcome.
       * ``NOT``: the value is a single sub-rule whose result is negated.
       * ``HEADER_CONTAINS``: true when the upper-cased value occurs in
         the upper-cased header.

     Any other key falls through, so the method returns ``None`` for it.
     """
     operator = tuple_[0].upper()
     operand = tuple_[1]
     header = header.upper()

     # --------- Logical separators --------
     if operator == "AND":
         outcome = True
         for sub_rule in operand:
             outcome = self.parse_rules(sub_rule, header)
             if not outcome:  # first failing sub-rule decides
                 break
         return outcome
     if operator == "OR":
         outcome = False
         for sub_rule in operand:
             outcome = self.parse_rules(sub_rule, header)
             if outcome:  # first matching sub-rule decides
                 break
         return outcome
     if operator == "NOT":
         return not self.parse_rules(operand, header)

     # -------------- Rules ----------------
     if operator == "HEADER_CONTAINS":
         try:
             return make_unicode(header).find(operand.upper()) > -1
         except UnicodeDecodeError:
             # Text that cannot be decoded counts as "not found".
             return False
Beispiel #3
0
    def _mergeInto(self, key, value):
        """Bake new values into the metadata dictionary ``self.data``.

        ``value`` is flattened recursively and every resulting string is
        appended to the list at ``self.data[key]``. Typical inputs and the
        values that end up stored:

          * {"A": ["B", "C"]}  =>  B, C
          * [{"A": "B"}]       =>  B
          * {'x-default': 'B'} =>  B
          * ["B", "C"]         =>  B, C
          * "B"                =>  B

        Dictionary keys are dropped. Falsy values (``None``, ``""``),
        strings that are blank after stripping, and values already stored
        under ``key`` are skipped.
        """
        if isinstance(value, dict):
            # The values could be encoded anyhow, so each one is passed
            # through make_unicode. ``values()`` is used because it works
            # on Python 2 and Python 3 alike, unlike ``iteritems()``.
            value = [make_unicode(entry) for entry in value.values()]
        if isinstance(value, list):
            for entry in value:
                self._mergeInto(key, entry)
            return

        if not value:  # value can be None
            return
        # By now the value is expected to be a string.
        cleaned = make_unicode(value.strip())
        if not cleaned:
            return
        # The list is only created once there is something to put in it.
        stored = self.data.setdefault(key, [])
        if cleaned not in stored:
            stored.append(cleaned)
Beispiel #4
0
    def get_text(self):
        """Return the text of every LT* object on the page, newline-joined.

        The result is computed on the first call and stored in
        ``self._text_cache``; later calls return the stored value.
        """
        try:
            cached = self._text_cache
        except AttributeError:  # first call: nothing cached yet
            pass
        else:
            return cached

        fragments = [self._parse_obj(item) for item in self.LTPage]
        try:
            joined = u'\n'.join(fragments)
        except UnicodeDecodeError:
            # Joining failed on undecodable text: coerce each piece first.
            joined = u'\n'.join(make_unicode(piece) for piece in fragments)

        self._text_cache = joined
        return joined
Beispiel #5
0
    def get_text(self):
        """Return the plain text of all pages, joined by newlines.

        Pages whose text is empty after stripping are left out. The
        result is stored in ``self._text_cache`` on the first call and
        returned from there afterwards.
        """
        try:
            cached = self._text_cache
        except AttributeError:  # nothing cached yet
            pass
        else:
            return cached

        page_texts = (page.get_text() for page in self.get_next_page())
        kept = [chunk for chunk in page_texts if chunk.strip() != ""]
        try:
            combined = u'\n'.join(kept)
        except UnicodeDecodeError:
            # Joining failed on undecodable text: coerce each page first.
            combined = u'\n'.join(make_unicode(chunk) for chunk in kept)

        self._text_cache = combined
        return combined
Beispiel #6
0
    def get_text(self):
        """Collect the text of every page into one newline-separated string.

        Pages that contain only whitespace are skipped. The string is
        built once, kept in ``self._text_cache`` and reused by later calls.
        """
        try:
            already_built = self._text_cache
        except AttributeError:  # not cached
            pass
        else:
            return already_built

        non_empty = []
        for current_page in self.get_next_page():
            page_text = current_page.get_text()
            if page_text.strip() == "":
                continue
            non_empty.append(page_text)

        try:
            full_text = u'\n'.join(non_empty)
        except UnicodeDecodeError:
            # Fall back to converting every page text before joining.
            full_text = u'\n'.join(make_unicode(part) for part in non_empty)

        self._text_cache = full_text
        return full_text
Beispiel #7
0
    def get_text(self):
        """Capture the text of all LT* objects on this page.

        Each object in ``self.LTPage`` is passed to ``self._parse_obj``
        and the results are joined with newlines. The joined text is
        cached in ``self._text_cache`` on the first call.
        """
        try:
            stored = self._text_cache
        except AttributeError:  # cache is not there
            pass
        else:
            return stored

        pieces = [self._parse_obj(layout_obj) for layout_obj in self.LTPage]
        try:
            page_text = u'\n'.join(pieces)
        except UnicodeDecodeError:
            # Convert each piece individually when the plain join fails.
            page_text = u'\n'.join(make_unicode(piece) for piece in pieces)

        self._text_cache = page_text
        return page_text
Beispiel #8
0
    def get_header(self):
        """Guess which text on the page belongs to the page header.

        Objects are visited from the top of the page downwards and their
        text is collected until the first of these stops the scan:

          * an object with more than 100 characters of stripped text
            (taken to be the first paragraph),
          * more than five objects with non-empty text were collected,
          * an object that starts below the top fifth of the page.

        The collected texts are joined with single spaces. The result is
        cached in ``self._header_cache`` on the first call.

        NOTE: bounding boxes are assumed to follow the pdfminer layout
        ``(x0, y0, x1, y1)``.
        """
        from operator import itemgetter

        try:
            cached = self._header_cache
        except AttributeError:  # cache is not there
            pass
        else:
            return cached

        # The page height is y1 - y0 (not y1 - x0), and the cut-off has to
        # be measured from the page's own lower edge so that pages whose
        # origin is not (0, 0) are handled correctly.
        page_bbox = self.LTPage.bbox
        page_bottom, page_top = page_bbox[1], page_bbox[3]
        header_floor = page_bottom + (page_top - page_bottom) * 0.8

        # Get all objects together with their vertical position
        candidates = []
        for obj in self.LTPage:
            obj_text = self._parse_obj(obj, do_ocr=True)
            candidates.append((obj.bbox[1], obj_text))
        candidates.sort(key=itemgetter(0), reverse=True)  # top of page first

        # Get what looks most like a header
        header_texts = []
        seen_with_text = 0
        for (obj_bottom, obj_text) in candidates:
            text_length = len(obj_text.strip())
            if text_length > 100:  # first paragraph reached
                break
            if seen_with_text > 5:  # six objects with content collected
                break
            if obj_bottom < header_floor:  # left the top fifth of the page
                break
            header_texts.append(obj_text)
            if text_length > 0:
                seen_with_text += 1

        try:
            header = u' '.join(header_texts)
        except UnicodeDecodeError:
            header = u' '.join(make_unicode(x) for x in header_texts)
        self._header_cache = header
        return header
Beispiel #9
0
    def get_header(self):
        """Try to guess what text belongs to the page header.

        All objects on the page are sorted from top to bottom. Their text
        is then collected until the scan hits one of these conditions:

          * the object's stripped text is longer than 100 characters
            (treated as the first paragraph),
          * more than five objects with non-empty text were collected,
          * the object starts below the top fifth of the page.

        The texts are joined with single spaces and the result is cached
        in ``self._header_cache``.

        NOTE: bounding boxes are assumed to follow the pdfminer layout
        ``(x0, y0, x1, y1)``.
        """
        from operator import itemgetter

        try:
            remembered = self._header_cache
        except AttributeError:  # cache is not there
            pass
        else:
            return remembered

        # Height is measured along the y axis (y1 - y0, not y1 - x0) and
        # offset by y0, so the cut-off is right for any page origin.
        page_bbox = self.LTPage.bbox
        y_low, y_high = page_bbox[1], page_bbox[3]
        top_fifth = y_low + (y_high - y_low) * 0.8

        # Get all objects containing text, paired with their y position
        positioned = []
        for obj in self.LTPage:
            obj_text = self._parse_obj(obj, do_ocr=True)
            positioned.append((obj.bbox[1], obj_text))
        positioned.sort(key=itemgetter(0), reverse=True)  # highest first

        # Get what looks most like a header
        header_texts = []
        with_content = 0
        for (y_pos, obj_text) in positioned:
            text_length = len(obj_text.strip())
            if text_length > 100:  # break on first paragraph
                break
            if with_content > 5:  # six objects with content collected
                break
            if y_pos < top_fifth:  # below the top fifth of the page
                break
            header_texts.append(obj_text)
            if text_length > 0:
                with_content += 1

        try:
            result = u' '.join(header_texts)
        except UnicodeDecodeError:
            result = u' '.join(make_unicode(x) for x in header_texts)
        self._header_cache = result
        return result
Beispiel #10
0
 def merge_with(self, document):
     """Append the text of ``document`` to this document's text.

     If plain concatenation fails with a ``UnicodeDecodeError``, both
     texts are passed through ``make_unicode`` before being joined.
     """
     try:
         merged = self.text + document.text
     except UnicodeDecodeError:
         merged = make_unicode(self.text) + make_unicode(document.text)
     self.text = merged