Esempio n. 1
0
    def consume(self):
        """
        Scan ``self.CONSUME`` and process every candidate document in it.

        A directory entry is skipped when it is not a regular file, when its
        joined path does not match ``self.REGEX_TITLE``, when it is already
        listed in ``self._ignore``, or when ``self._is_ready`` returns a
        truthy value for it.

        Each remaining document gets a scratch directory created under
        ``self.SCRATCH``; it is handed to ``self._get_greyscale``, the result
        goes to ``self._get_ocr`` and the text to ``self._store``. If that
        raises ``OCRError`` the document is appended to ``self._ignore`` and
        the failure is logged; otherwise ``self._cleanup_doc`` is called.

        The scratch directory is passed to ``self._cleanup_tempdir`` on every
        path, including when an unexpected exception propagates out of the
        helpers, so that it cannot be leaked.
        """

        for entry in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, entry)

            if not os.path.isfile(doc):
                continue

            # The pattern is applied to the joined path, not the bare name.
            if not re.match(self.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # NOTE(review): a truthy _is_ready() result skips the document;
            # confirm that this matches the helper's return convention.
            if self._is_ready(doc):
                continue

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
            try:
                pngs = self._get_greyscale(tempdir, doc)

                try:
                    text = self._get_ocr(pngs)
                    self._store(text, doc)
                except OCRError:
                    self._ignore.append(doc)
                    Log.error(
                        "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
                    continue
            finally:
                # Runs for success, OCR failure and unexpected errors alike.
                self._cleanup_tempdir(tempdir)

            # Only reached when the document was stored successfully.
            self._cleanup_doc(doc)
Esempio n. 2
0
    def consume(self):
        """
        Walk the entries of ``self.CONSUME`` and process the eligible ones.

        An entry is ignored unless it is a regular file whose joined path
        matches ``self.REGEX_TITLE``, is not listed in ``self._ignore`` and
        for which ``self._is_ready`` returns a falsy value.

        For every eligible document a scratch directory is created under
        ``self.SCRATCH`` and the document goes through
        ``self._get_greyscale``, ``self._get_ocr`` and ``self._store``.
        An ``OCRError`` adds the document to ``self._ignore`` and logs the
        failure; a successful run ends with ``self._cleanup_doc``.

        ``self._cleanup_tempdir`` is guaranteed to run for the scratch
        directory, even if one of the helpers raises something other than
        ``OCRError``.
        """

        for name in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, name)

            # Guards are evaluated in this order and short-circuit, so
            # ``self._is_ready`` is only consulted for plausible candidates.
            if (not os.path.isfile(doc)
                    or not re.match(self.REGEX_TITLE, doc)
                    or doc in self._ignore
                    or self._is_ready(doc)):
                continue

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
            try:
                pngs = self._get_greyscale(tempdir, doc)

                try:
                    text = self._get_ocr(pngs)
                    self._store(text, doc)
                except OCRError:
                    self._ignore.append(doc)
                    Log.error("OCR FAILURE: {}".format(doc),
                              Log.COMPONENT_CONSUMER)
                    continue
            finally:
                # Always drop the scratch directory, whatever happened above.
                self._cleanup_tempdir(tempdir)

            # The source document is only cleaned up after a successful store.
            self._cleanup_doc(doc)
Esempio n. 3
0
    def consume(self):
        """
        Process every eligible entry found in ``self.CONSUME``.

        Entries that are not regular files, whose joined path does not match
        ``self.REGEX_TITLE``, that are listed in ``self._ignore`` or for which
        ``self._is_ready`` returns a truthy value are skipped.

        Each remaining document is passed to ``self._get_greyscale`` and the
        result to ``self._get_ocr``. On ``OCRError`` the document is added to
        ``self._ignore`` and the failure is logged; otherwise the text is
        handed to ``self._store`` and ``self._cleanup`` is called.
        """

        for name in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, name)

            # Short-circuit evaluation keeps the checks in their original
            # order, so later checks only run when the earlier ones pass.
            skip = (
                not os.path.isfile(doc)
                or not re.match(self.REGEX_TITLE, doc)
                or doc in self._ignore
                or self._is_ready(doc)
            )
            if skip:
                continue

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                # NOTE(review): ``pngs`` is not cleaned up on this path.
                self._ignore.append(doc)
                Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
            else:
                self._store(text, doc)
                self._cleanup(pngs, doc)
Esempio n. 4
0
    def parse(self, filename):
        """
        Parses all available information from the given filename.

        The directory part of ``filename`` is dropped. Every tag placeholder
        found in ``self.naming_scheme`` is replaced by that tag's regular
        expression and the resulting pattern is searched for in the base name.
        The named groups are then passed through ``relations`` of each tag
        that occurs in the naming scheme.

        If exactly one of ``startDatetime`` / ``endDatetime`` is present, the
        missing one is derived by shifting the other by
        ``timedelta(self.timespan)`` (``self.timespan`` days) and its
        date/time component fields are filled in.

        Returns the mapping of parsed values, or ``None`` after logging an
        error when anything goes wrong (for example when the pattern does not
        match the filename).
        """
        filename = os.path.split(filename)[1]

        def fill_fields(values, prefix, moment):
            # Writes the nine ``<prefix>*`` entries derived from ``moment``.
            values[prefix + 'Datetime'] = moment
            values[prefix + 'Date'] = moment.date()
            values[prefix + 'Year'] = moment.strftime('%Y')
            values[prefix + 'Month'] = moment.strftime('%m')
            values[prefix + 'Day'] = moment.strftime('%d')

            values[prefix + 'Time'] = moment.time()
            values[prefix + 'Hour'] = moment.strftime('%H')
            values[prefix + 'Minute'] = moment.strftime('%M')
            values[prefix + 'Second'] = moment.strftime('%S')

        try:
            fmt = self.naming_scheme
            pattern = fmt
            for tag in tags:
                pattern = pattern.replace(tag.tag, tag.re)

            # ``re.search`` returns None on a mismatch; the resulting
            # AttributeError is handled by the ``except`` clause below.
            parsed = re.search(pattern, filename).groupdict()

            for tag in [tag for tag in tags if tag.tag in fmt]:
                parsed = tag.relations(parsed)

            has_start = 'startDatetime' in parsed
            has_end = 'endDatetime' in parsed

            if has_end and not has_start:
                fill_fields(
                    parsed, 'start',
                    parsed['endDatetime'] - timedelta(self.timespan))
            elif has_start and not has_end:
                fill_fields(
                    parsed, 'end',
                    parsed['startDatetime'] + timedelta(self.timespan))

            return parsed
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            Log.error('analysis.models.DataStream.parse', """
                An error occurred while attempting to parse filename {} using naming convention {}
                Check that the naming convention for this data stream has not been changed at the source.
            """.format(filename, self.naming_scheme))
            return None
Esempio n. 5
0
    def _fetch(self):
        """
        Generator over the messages returned by an ``ALL`` search.

        Each message number is fetched as ``(RFC822)`` and wrapped in
        ``Message``. When that raises ``InvalidMessageError`` the error is
        logged and the message is skipped; otherwise the message is flagged
        ``\\Deleted`` on the connection and, if truthy, yielded.
        """

        message_numbers = self._connection.search(None, "ALL")[1][0].split()

        for num in message_numbers:

            __, data = self._connection.fetch(num, "(RFC822)")

            try:
                message = Message(data[0][1], self.verbosity)
            except InvalidMessageError as e:
                Log.error(e, Log.COMPONENT_MAIL)
                continue

            # Only messages that were built successfully are flagged.
            self._connection.store(num, "+FLAGS", "\\Deleted")

            if message:
                yield message
Esempio n. 6
0
    def _fetch(self):
        """
        Yield a ``Message`` for every usable mail found on the connection.

        All message numbers from an ``ALL`` search are fetched as
        ``(RFC822)``. A mail that makes ``Message`` raise
        ``InvalidMessageError`` is logged and left untouched; any other mail
        is flagged ``\\Deleted`` and yielded when the message is truthy.
        """

        search_result = self._connection.search(None, "ALL")

        for num in search_result[1][0].split():

            __, data = self._connection.fetch(num, "(RFC822)")

            try:
                message = Message(data[0][1], self.verbosity)
            except InvalidMessageError as e:
                message = None
                Log.error(e, Log.COMPONENT_MAIL)
            else:
                self._connection.store(num, "+FLAGS", "\\Deleted")

            if not message:
                continue

            yield message
Esempio n. 7
0
    def fetch_data_ftp(self):
        """
        Log in to ``self.host`` over FTP, change into the host directory,
        collect the names of the remote files that pass the file test, and
        make sure the local target directory exists.

        A remote file name is collected when the file test accepts it and it
        is either missing from the target directory or ``self.overwrite`` is
        not equal to ``False``.

        A permanent FTP error with reply code 550 while changing directory is
        logged; any other permanent error is re-raised. An ``OSError`` from
        creating the target directory is re-raised unless it reports that the
        directory already exists.

        NOTE(review): ``data`` is built but not used in the visible body and
        the connection is never closed here; this method may be truncated.
        """
        import errno
        import os
        from ftplib import FTP, error_perm

        # SECURITY NOTE(review): these attributes are evaluated as Python
        # source. This is only safe if they come from a trusted origin;
        # confirm where they are set.
        host_directory = eval(self.host_directory)
        test = eval(self.file_test)
        client_subdirectory = eval(self.client_subdirectory)
        target = os.path.join(settings.STATIC_ROOT, 'data', self.client_directory, client_subdirectory())

        ftp = FTP(host=self.host)
        ftp.login(user=self.user, passwd=self.password)
        try:
            ftp.cwd(host_directory())
        except error_perm as e:
            # ftplib errors carry no ``errno``; the server reply code is the
            # first three characters of the exception message.
            if str(e)[:3] == '550':
                Log.error('atmospherics.data.models.FTPSource.fetch_data_ftp', """
                    An error occurred while accessing the directory {} on {}.
                    
                    Try checking the host server to ensure that their naming and filing scheme has not changed.
                    
                    The code used to generate this directory: {}
                """.format( host_directory(), self.host, self.host_directory ))
            else:
                raise

        data = []
        for datafile in ftp.nlst():
            if test(datafile):
                if os.path.exists(os.path.join(target, datafile)) and self.overwrite == False:
                    pass
                else:
                    data.append(datafile)

        if not os.path.exists(target):
            try:
                os.makedirs(target)
            except OSError as e:
                # The directory may have been created between the existence
                # check and makedirs(); only that case is tolerated.
                if e.errno != errno.EEXIST:
                    raise
Esempio n. 8
0
    def _get_messages(self):
        """
        Connect, log in and return the list of truthy messages produced by
        ``self._fetch``.

        After fetching, ``expunge``, ``close`` and ``logout`` are called on
        ``self._connection`` in that order. Any exception raised along the
        way is logged, and the messages gathered up to that point are still
        returned.
        """

        messages = []

        try:
            self._connect()
            self._login()

            for message in self._fetch():
                if not message:
                    continue
                messages.append(message)

            # Wrap up the session: expunge, then close, then logout.
            for finish in ("expunge", "close", "logout"):
                getattr(self._connection, finish)()

        except Exception as e:
            Log.error(e, Log.COMPONENT_MAIL)

        return messages
Esempio n. 9
0
    def _get_messages(self):
        """
        Collect every truthy message yielded by ``self._fetch``.

        The method connects and logs in first, and finishes by calling
        ``expunge``, ``close`` and ``logout`` on ``self._connection``. If any
        step raises, the exception is logged and whatever was collected so
        far is returned.
        """

        collected = []

        try:

            self._connect()
            self._login()

            for candidate in self._fetch():
                if candidate:
                    collected += [candidate]

            self._connection.expunge()
            self._connection.close()
            self._connection.logout()

        except Exception as error:
            Log.error(error, Log.COMPONENT_MAIL)

        return collected