def consume(self):
    """
    Scan the consumption directory, OCR each eligible document, and
    store the extracted text.

    A file is skipped when it is not a regular file, does not match
    REGEX_TITLE, was previously flagged as unprocessable, or when
    ``_is_ready`` says to leave it alone.  Documents whose OCR fails are
    added to the ignore list so they are not retried on the next pass.
    """
    for entry in os.listdir(self.CONSUME):

        path = os.path.join(self.CONSUME, entry)

        # Guard clauses: anything failing these checks is left in place.
        if not os.path.isfile(path):
            continue
        if not re.match(self.REGEX_TITLE, path):
            continue
        if path in self._ignore:
            continue
        if self._is_ready(path):
            continue

        Log.info("Consuming {}".format(path), Log.COMPONENT_CONSUMER)

        # Scratch space for the intermediate greyscale page images.
        tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        pngs = self._get_greyscale(tempdir, path)

        try:
            self._store(self._get_ocr(pngs), path)
        except OCRError:
            # Remember the failure so this file isn't endlessly retried.
            self._ignore.append(path)
            Log.error(
                "OCR FAILURE: {}".format(path), Log.COMPONENT_CONSUMER)
            self._cleanup_tempdir(tempdir)
            continue
        else:
            # Success: drop both the scratch dir and the original file.
            self._cleanup_tempdir(tempdir)
            self._cleanup_doc(path)
def consume(self):
    """
    Walk the consumption directory and run OCR on every candidate file.

    Candidates must be regular files matching REGEX_TITLE that are not
    already ignored and for which ``_is_ready`` returns False.  On OCR
    failure the file is blacklisted and the scratch dir removed; on
    success both the scratch dir and the consumed file are cleaned up.
    """
    for candidate in os.listdir(self.CONSUME):

        candidate = os.path.join(self.CONSUME, candidate)

        # Short-circuit order matches the eligibility checks exactly.
        unsuitable = (
            not os.path.isfile(candidate)
            or not re.match(self.REGEX_TITLE, candidate)
            or candidate in self._ignore
            or self._is_ready(candidate)
        )
        if unsuitable:
            continue

        Log.info("Consuming {}".format(candidate), Log.COMPONENT_CONSUMER)

        scratch = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        pages = self._get_greyscale(scratch, candidate)

        try:
            text = self._get_ocr(pages)
            self._store(text, candidate)
        except OCRError:
            # Blacklist the file so the next pass doesn't retry it.
            self._ignore.append(candidate)
            Log.error("OCR FAILURE: {}".format(candidate),
                      Log.COMPONENT_CONSUMER)
            self._cleanup_tempdir(scratch)
            continue
        else:
            self._cleanup_tempdir(scratch)
            self._cleanup_doc(candidate)
def consume(self):
    """
    Process every eligible document in the consumption directory.

    Skips non-files, names not matching REGEX_TITLE, previously ignored
    paths, and anything ``_is_ready`` flags.  Successfully OCR'd text is
    stored and the intermediate PNGs plus the source file cleaned up;
    failures are logged and blacklisted.
    """
    for filepath in os.listdir(self.CONSUME):

        filepath = os.path.join(self.CONSUME, filepath)

        if not os.path.isfile(filepath):
            continue
        if not re.match(self.REGEX_TITLE, filepath):
            continue
        if filepath in self._ignore:
            continue
        if self._is_ready(filepath):
            continue

        Log.info("Consuming {}".format(filepath), Log.COMPONENT_CONSUMER)

        pngs = self._get_greyscale(filepath)

        try:
            text = self._get_ocr(pngs)
        except OCRError:
            # Don't retry files that already failed OCR once.
            self._ignore.append(filepath)
            Log.error("OCR FAILURE: {}".format(filepath),
                      Log.COMPONENT_CONSUMER)
            continue
        else:
            self._store(text, filepath)
            self._cleanup(pngs, filepath)
def parse(self, filename):
    """
    Parses all available information from the given filename.

    The stream's ``naming_scheme`` is turned into a regex by replacing
    each known tag with its pattern, matched against the basename, and
    each matched tag's ``relations`` hook is applied to the resulting
    dict.  If only one end of the start/end datetime pair was parsed,
    the other end is derived using ``self.timespan`` (passed to
    ``timedelta``, i.e. interpreted as days) and the derived datetime is
    expanded into its Date/Time/Year/.../Second component fields.

    Returns the dict of parsed values, or None (implicitly) when parsing
    fails — the failure is logged rather than raised.
    """
    filename = os.path.split(filename)[1]
    try:
        fmt = self.naming_scheme

        # Build the filename-matching regex from the naming scheme.
        pattern = fmt
        for tag in tags:
            pattern = pattern.replace(tag.tag, tag.re)
        # AttributeError here (search() -> None) means the name didn't
        # match the scheme; it is caught and logged below.
        parsed = re.search(pattern, filename).groupdict()

        # Let each tag present in the scheme post-process the values.
        for tag in [t for t in tags if t.tag in fmt]:
            parsed = tag.relations(parsed)

        # Derive whichever end of the datetime pair is missing.
        if 'startDatetime' not in parsed and 'endDatetime' in parsed:
            prefix = 'start'
            derived = parsed['endDatetime'] - timedelta(self.timespan)
        elif 'startDatetime' in parsed and 'endDatetime' not in parsed:
            prefix = 'end'
            derived = parsed['startDatetime'] + timedelta(self.timespan)
        else:
            prefix = derived = None

        if derived is not None:
            parsed[prefix + 'Datetime'] = derived
            parsed[prefix + 'Date'] = derived.date()
            parsed[prefix + 'Time'] = derived.time()
            for key, code in (('Year', '%Y'), ('Month', '%m'),
                              ('Day', '%d'), ('Hour', '%H'),
                              ('Minute', '%M'), ('Second', '%S')):
                parsed[prefix + key] = derived.strftime(code)

        return parsed
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed.  Failure still only logs (returns None).
        Log.error('analysis.models.DataStream.parse', """
            An error occurred while attempting to parse filename
            {}
            using naming convention
            {}
            Check that the naming convention for this data stream has not
            been changed at the source.
            """.format(filename, self.naming_scheme))
def _fetch(self):
    """
    Generator over the parsed messages in the current IMAP mailbox.

    Every message in the mailbox is fetched as RFC822; ones that fail to
    parse are logged and skipped, while successfully parsed ones are
    flagged ``\\Deleted`` on the server and yielded (when truthy).
    """
    message_numbers = self._connection.search(None, "ALL")[1][0].split()
    for num in message_numbers:
        _, data = self._connection.fetch(num, "(RFC822)")
        try:
            message = Message(data[0][1], self.verbosity)
        except InvalidMessageError as e:
            # Unparseable message: log it and leave it on the server.
            Log.error(e, Log.COMPONENT_MAIL)
            continue
        # Mark for deletion only after a successful parse.
        self._connection.store(num, "+FLAGS", "\\Deleted")
        if message:
            yield message
def fetch_data_ftp(self):
    """
    Connect to the configured FTP host and collect the names of data
    files to download into the client's target directory.

    The directory name, file-selection test, and client subdirectory are
    produced by evaluating code stored on the model.  Files already
    present locally are skipped unless ``self.overwrite`` is set.  The
    target directory is created if missing.
    """
    import ftplib
    import errno
    import os

    # SECURITY: eval() of model-stored code executes arbitrary Python.
    # Anyone who can edit these fields can run code on this host —
    # consider replacing with a registry of named callables.
    host_directory = eval(self.host_directory)
    test = eval(self.file_test)
    client_subdirectory = eval(self.client_subdirectory)

    target = os.path.join(settings.STATIC_ROOT, 'data',
                          self.client_directory, client_subdirectory())

    ftp = ftplib.FTP(host=self.host)
    ftp.login(user=self.user, passwd=self.password)
    try:
        ftp.cwd(host_directory())
    # Was `ftp_lib.error_perm` — a NameError, since no module was ever
    # imported under that name.
    except ftplib.error_perm as e:
        # error_perm has no .errno (the original `e.errno == 550` would
        # itself raise AttributeError); the reply code is the first three
        # characters of the error text.
        if str(e).startswith('550'):
            Log.error('atmospherics.data.models.FTPSource.fetch_data_ftp', """
                An error occurred while accessing the directory {} on {}.
                Try checking the host server to ensure that their naming
                and filing scheme has not changed.  The code used to
                generate this directory: {}
                """.format(host_directory(), self.host, self.host_directory))
        else:
            raise

    # Select remote files, skipping ones we already have unless
    # overwriting is enabled.
    data = []
    for datafile in ftp.nlst():
        if not test(datafile):
            continue
        if os.path.exists(os.path.join(target, datafile)) \
                and not self.overwrite:
            continue
        data.append(datafile)

    if not os.path.exists(target):
        try:
            os.makedirs(target)
        # Was Py2-only `except OSError, e:`; EEXIST (17) means another
        # process created the directory first, which is fine.
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
def _get_messages(self):
    """
    Connect and log in to the mail server, collect every truthy message
    produced by ``_fetch``, expunge the deletions, and tear the
    connection down.

    Any failure along the way is logged and the messages gathered up to
    that point are returned, so the caller always gets a list.
    """
    messages = []
    try:
        self._connect()
        self._login()

        for msg in self._fetch():
            if not msg:
                continue
            messages.append(msg)

        # Apply the \Deleted flags set by _fetch(), then shut down.
        self._connection.expunge()
        self._connection.close()
        self._connection.logout()
    except Exception as e:
        # Boundary handler: log and fall through to return what we have.
        Log.error(e, Log.COMPONENT_MAIL)

    return messages