def __getPlaintext(self):
        """Extract the PDF's text, normalize it and package it with metadata.

        The PDF at ``self.wd + os.sep + self.filename`` is converted to text
        starting at the page returned by ``__guessDocBegining`` (the second
        ``pdf2txt`` argument ``"max"`` presumably means "up to the last
        page" -- confirm against ``PdfLib``). The text is then passed
        through a fixed sequence of ``Filter`` steps.

        Returns:
            A dict holding the guessed language under ``self.langKey``, the
            normalized text under ``self.plaintextKey`` and the file name
            under ``self.filenameKey``.

        Raises:
            Exception: if the normalized text has 6000 characters or fewer.
        """
        # extract plaintext from pdf
        document = PdfLib(self.wd + os.sep + self.filename)
        firstPage = self.__guessDocBegining(self.filename)
        rawText = document.pdf2txt(firstPage, "max")

        # normalize text: every step is a no-argument Filter method whose
        # return value is the receiver of the next step, so order matters
        normalizationSteps = (
            "substitutions",
            "oneCharPerLine",
            "normalizeCaracters",
            "lower",
            "uselessCharacters",
            "multipleDots",
            "listEnum",
            "digits",
            "shortTokens",
            "multipleSpaces",
        )
        stage = Filter(asString=rawText)
        for stepName in normalizationSteps:
            stage = getattr(stage, stepName)()
        cleaned = stage.getResult()

        # experience shows, that less than 6000 characters is mostly waste
        if len(cleaned) <= 6000:
            raise Exception(u"Document is too short.")

        return {
            self.langKey: self.__guessLang(cleaned),
            self.plaintextKey: cleaned,
            self.filenameKey: self.filename,
        }
    def __getPlaintext(self):
        """Extract the PDF's text, normalize it and package it with metadata.

        The PDF at ``self.wd + os.sep + self.filename`` is converted to text
        starting at the page returned by ``__guessDocBegining`` (the second
        ``pdf2txt`` argument ``"max"`` presumably means "up to the last
        page" -- confirm against ``PdfLib``). The text is then passed
        through a fixed sequence of ``Filter`` steps.

        Returns:
            A dict holding the guessed language under ``self.langKey``, the
            normalized text under ``self.plaintextKey`` and the file name
            under ``self.filenameKey``.

        Raises:
            Exception: if the normalized text has 6000 characters or fewer.
        """
        # extract plaintext from pdf
        document = PdfLib(self.wd + os.sep + self.filename)
        firstPage = self.__guessDocBegining(self.filename)
        rawText = document.pdf2txt(firstPage, "max")

        # normalize text: every step is a no-argument Filter method whose
        # return value is the receiver of the next step, so order matters
        normalizationSteps = (
            "substitutions",
            "oneCharPerLine",
            "normalizeCaracters",
            "lower",
            "uselessCharacters",
            "multipleDots",
            "listEnum",
            "digits",
            "shortTokens",
            "multipleSpaces",
        )
        stage = Filter(asString=rawText)
        for stepName in normalizationSteps:
            stage = getattr(stage, stepName)()
        cleaned = stage.getResult()

        # experience shows, that less than 6000 characters is mostly waste
        if len(cleaned) <= 6000:
            raise Exception(u"Document is too short.")

        return {
            self.langKey: self.__guessLang(cleaned),
            self.plaintextKey: cleaned,
            self.filenameKey: self.filename,
        }
 def __guessDocBegining(self, filename):
     """Guess the page on which the document's actual text begins.

     Pages 1 to 4 are converted to text one at a time. The first page that
     has more than 1300 characters, or whose lower-cased text contains
     "abstract" or "introduction", is taken as the beginning. If none of
     them qualifies, page 5 is returned. Those values are based on
     experience, not science ;)

     Returns:
         The guessed page number, or None (after logging an info message)
         when the file does not exist below ``self.wd``.
     """
     pdfPath = self.wd + os.sep + filename
     if not os.path.exists(pdfPath):
         self.logger.info(u"{} does not exist.".format(filename))
         return None

     fallbackPage = 5
     minChars = 1300
     markers = ("abstract", "introduction")
     for page in range(1, fallbackPage):
         # a fresh PdfLib per page, exactly as before
         pageText = PdfLib(pdfPath).pdf2txt(page)
         charCount = len(pageText)
         lowered = pageText.lower()
         if charCount > minChars or any(word in lowered for word in markers):
             return page
     return fallbackPage
 def __guessDocBegining(self, filename):
     """Guess the page on which the document's actual text begins.

     Pages 1 to 4 are converted to text one at a time. The first page that
     has more than 1300 characters, or whose lower-cased text contains
     "abstract" or "introduction", is taken as the beginning. If none of
     them qualifies, page 5 is returned. Those values are based on
     experience, not science ;)

     Returns:
         The guessed page number, or None (after logging an info message)
         when the file does not exist below ``self.wd``.
     """
     pdfPath = self.wd + os.sep + filename
     if not os.path.exists(pdfPath):
         self.logger.info(u"{} does not exist.".format(filename))
         return None

     fallbackPage = 5
     minChars = 1300
     markers = ("abstract", "introduction")
     for page in range(1, fallbackPage):
         # a fresh PdfLib per page, exactly as before
         pageText = PdfLib(pdfPath).pdf2txt(page)
         charCount = len(pageText)
         lowered = pageText.lower()
         if charCount > minChars or any(word in lowered for word in markers):
             return page
     return fallbackPage