Esempio n. 1
0
    def consume(self):
        """
        Walk the consumption directory and OCR + store every eligible file.

        An entry is skipped when it is not a regular file, when its joined
        path does not match ``self.REGEX_TITLE``, when it is already listed in
        ``self._ignore`` (a previous OCR failure), or when
        ``self._is_ready(doc)`` returns a truthy value.

        For every remaining file a scratch directory is created under
        ``self.SCRATCH``; it is always removed again, whether processing
        succeeds, raises ``OCRError`` or raises anything else.  The source
        document is only cleaned up after a successful store.
        """

        for entry in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, entry)

            if not os.path.isfile(doc):
                continue

            if not re.match(self.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # NOTE(review): the file is skipped when _is_ready() is truthy,
            # which reads inverted -- confirm against _is_ready's contract.
            if self._is_ready(doc):
                continue

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
            try:
                pngs = self._get_greyscale(tempdir, doc)

                try:
                    text = self._get_ocr(pngs)
                    self._store(text, doc)
                except OCRError:
                    # Remember the failure so the file is not retried on the
                    # next pass.
                    self._ignore.append(doc)
                    Log.error("OCR FAILURE: {}".format(doc),
                              Log.COMPONENT_CONSUMER)
                    continue
            finally:
                # Runs on success, on OCRError (before the ``continue`` takes
                # effect) and on unexpected exceptions, so the scratch
                # directory can no longer be leaked.
                self._cleanup_tempdir(tempdir)

            self._cleanup_doc(doc)
Esempio n. 2
0
    def consume(self):
        """
        Process each eligible document found in ``self.CONSUME``.

        Entries are ignored when they are not regular files, when the joined
        path fails ``re.match(self.REGEX_TITLE, ...)``, when they are recorded
        in ``self._ignore``, or when ``self._is_ready(doc)`` is truthy.

        Each processed document gets its own scratch directory inside
        ``self.SCRATCH``.  That directory is removed in every outcome
        (success, ``OCRError``, or any other exception); the document itself
        is only cleaned up when OCR and storage both succeeded.
        """

        for name in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, name)

            if not os.path.isfile(doc):
                continue

            if not re.match(self.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # NOTE(review): a truthy _is_ready() causes a skip here; confirm
            # that this matches the helper's intended meaning.
            if self._is_ready(doc):
                continue

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
            ocr_failed = False
            try:
                pngs = self._get_greyscale(tempdir, doc)

                try:
                    text = self._get_ocr(pngs)
                    self._store(text, doc)
                except OCRError:
                    # Blacklist the file so later passes do not retry it.
                    ocr_failed = True
                    self._ignore.append(doc)
                    Log.error(
                        "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
            finally:
                # Guarantees the scratch directory is removed even when an
                # unexpected exception escapes from the steps above.
                self._cleanup_tempdir(tempdir)

            if not ocr_failed:
                self._cleanup_doc(doc)
Esempio n. 3
0
    def fetch_data_math(self):
        """
        Run this source's Mathematica code on the remote host and download
        whatever it wrote.

        Over one pysftp connection built from ``settings.SSH_*`` this:

        1. creates a remote scratch directory with ``mktemp -d``;
        2. writes the code returned by ``self.replace_exports(temp)`` to
           ``package.m`` in that directory;
        3. runs it with ``math -script`` under ``xvfb-run`` and logs the last
           entries of any returned output;
        4. removes ``package.m`` and copies the scratch directory into
           ``STATIC_ROOT/data/<client_directory>/<client_subdirectory()>``.

        The connection is closed even if one of the steps raises.
        """
        ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)

        try:
            temp = ssh.execute('mktemp -d')[0].rstrip('\n')
            ssh.chdir(temp)

            # Only the rewritten code is needed here; the second value is unused.
            code, _matches = self.replace_exports(temp)

            Log.debug('atmospherics.data.models.MathematicaSource.fetch_data', code)
            # Turn every ' into '"'"' so the code can sit inside the
            # single-quoted shell string built below.
            code = code.replace("'", '\'"\'"\'')
            command = "echo '{}' > {}/package.m".format(code, temp)
            ssh.execute(command)

            ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

            if ret:
                message = 'A message was returned by mathematica script  {}.m:\n{}'.format(self.name, ret[-100:])
                Log.info('atmospherics.data.models.MathematicaSource.fetch_data', message)

            ssh.execute('rm {}'.format(os.path.join(temp, 'package.m')))

            # SECURITY: eval() executes whatever Python expression is stored in
            # self.client_subdirectory and the result is then called.
            # NOTE(review): confirm that only trusted users can set this field.
            client_subdirectory = eval(self.client_subdirectory)
            target = os.path.join(settings.STATIC_ROOT, 'data', self.client_directory, client_subdirectory())
            if not os.path.exists(target):
                os.makedirs(target)
            ssh.get_d(temp, target)

            #ssh.execute('rm -rf {}'.format(temp))
            ssh.execute('disown')
        finally:
            # Previously the connection stayed open whenever a step above raised.
            ssh.close()

        message = 'MathematicaSource {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(self.name, target)
        Log.info('atmospherics.data.models.MathematicaSource.fetch_data', message)
Esempio n. 4
0
    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.

        Each attachment is written to ``Consumer.CONSUME`` under the message's
        ``file_name`` and its access/modification times are set to the
        message time.  ``self.last_checked`` is updated on every call, even
        when fetching is disabled.
        """

        if self._enabled:

            Log.info("Checking mail", Log.COMPONENT_MAIL)

            for message in self._get_messages():

                Log.debug(
                    'Storing email: "{}"'.format(message.subject),
                    Log.COMPONENT_MAIL
                )

                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(Consumer.CONSUME, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)

                # Stamp the file only once it is closed.  Doing it inside the
                # ``with`` block let the flush performed on close bump the
                # modification time back to "now", undoing the timestamp.
                os.utime(file_name, times=(t, t))

        self.last_checked = datetime.datetime.now()
Esempio n. 5
0
    def consume(self):
        """
        OCR and store every eligible file in the consumption directory.

        A file is passed over when it is not a regular file, when its joined
        path does not match ``self.REGEX_TITLE``, when it is listed in
        ``self._ignore``, or when ``self._is_ready(doc)`` is truthy.  Files
        whose OCR raises ``OCRError`` are added to ``self._ignore`` and logged;
        all others are stored and then handed to ``self._cleanup``.
        """

        for entry in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, entry)

            # Short-circuit evaluation keeps the checks in their original
            # order, so later checks never run once an earlier one rejects.
            skip = (
                not os.path.isfile(doc)
                or not re.match(self.REGEX_TITLE, doc)
                or doc in self._ignore
                or self._is_ready(doc)
            )
            if skip:
                continue

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
            else:
                self._store(text, doc)
                self._cleanup(pngs, doc)
Esempio n. 6
0
def mathematica_session(math_session, extra_args, user):
    """
    Runs the given Math session on the Thorek01 server.

    If a ``MathKernel`` or ``Mathematica`` process is already running on the
    host, the task is rescheduled through ``mathematica_session.retry``
    (every 5 minutes, up to 288 retries).  Otherwise the session code (from
    ``math_session.replace_exports``) has its ``{extra_args}`` and ``{data}``
    placeholders filled from ``extra_args``, is written to ``package.m`` in a
    remote scratch directory and run with ``math -script`` under
    ``xvfb-run``.  The scratch directory is copied to
    ``MEDIA_ROOT/<username>/output/<session name>_<mmddyy_HHMM>``; any output
    returned by the command is logged and saved there as ``response.txt``.
    A completion mail is sent when the user has a non-empty ``email``.

    The SSH connection is closed on every exit path, including exceptions.
    """
    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)

    try:
        if ssh.execute('pidof MathKernel') or ssh.execute('pidof Mathematica'):
            # retry() raises to abort this run; the ``finally`` below closes
            # the connection.  ``//`` keeps max_retries an int on Python 3.
            mathematica_session.retry(countdown=5*60, max_retries=(60//5)*24)

        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        # Only the rewritten code is used; the second value is not needed here.
        code, _matches = math_session.replace_exports(temp)

        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])

        Log.debug('atmospherics.analysis.tasks.mathematica_session', code)
        # Turn every ' into '"'"' so the code can sit inside the single-quoted
        # shell string built below.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

        target = os.path.join(settings.MEDIA_ROOT, user.username, 'output', math_session.name.replace(' ', '_')+datetime.now().strftime('_%m%d%y_%H%M'))

        # Raises if the directory already exists (the name only has minute
        # resolution); the connection is still closed by the ``finally``.
        os.makedirs(target)
        ssh.get_d(temp, target)

        if ret:
            # The literal's internal whitespace is kept exactly as before so
            # the logged/saved text does not change.
            message = '''
            A message was returned by mathematica script  {}.m:\n
            (trimmed to contain only the last 100 lines) \n\n
            {}
        '''.format(math_session.name, '\n'.join(ret[-100:]))
            Log.info('atmospherics.analysis.tasks.mathematica_session', message)

            with open(os.path.join(target, 'response.txt'), 'w') as logfile:
                logfile.write(message)

        #ssh.execute('rm -rf {}'.format(temp))
        ssh.execute('disown')
    finally:
        # Previously the connection leaked whenever a step above raised.
        ssh.close()

    # NOTE(review): the URL path is output/<username>/..., while the files are
    # stored under <username>/output/... -- confirm the URL routing.
    message = 'Mathematica session {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_session.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info('atmospherics.analysis.tasks.mathematica_session', message)
    # hasattr() was also true for an empty address, which produced a mail
    # addressed to ''; require an actual value instead.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_session.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject,
                                       message,
                                       from_email,
                                       [user.email])
        email.send()
Esempio n. 7
0
    def __init__(self, data, verbosity=1):
        """
        Parse a raw email and keep its single attachment.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        ``data`` is the raw message as bytes (it is handed to
        ``BytesParser.parsebytes``); ``verbosity`` is stored on the instance.
        After parsing, ``subject``, ``body``, ``time`` (via ``_set_time``) and
        ``attachment`` are populated.  ``check_subject()`` and ``check_body()``
        are called before the attachments are inspected.

        Raises ``InvalidMessageError`` when the message has no attachment,
        more than one attachment, or an attachment without a decodable
        payload.
        """

        self.verbosity = verbosity

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)
        self.subject = str(message["Subject"]).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        Log.info(
            'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)

        attachments = []
        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            # Strip the first token too, so "attachment ; filename=..." (space
            # before the semicolon) is still recognised.
            dispositions = content_disposition.strip().split(";")
            if not dispositions[0].strip().lower() == "attachment":
                continue

            # Decode according to the part's own Content-Transfer-Encoding
            # instead of assuming base64, which corrupted or rejected
            # attachments sent as quoted-printable, 7bit or 8bit.
            file_data = part.get_payload(decode=True)
            if file_data is None:
                # get_payload(decode=True) yields None for multipart
                # containers, which carry no payload of their own.
                raise InvalidMessageError(
                    "The attachment to this message could not be decoded")

            attachments.append(Attachment(
                file_data, content_type=part.get_content_type()))

        if len(attachments) == 0:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]
Esempio n. 8
0
    def __init__(self, data, verbosity=1):
        """
        Build a message object from raw email bytes.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        The bytes in ``data`` are parsed with ``BytesParser``; the subject and
        body are stored as strings, ``check_subject()`` / ``check_body()`` are
        run, ``_set_time(message)`` is called, and the one attachment found is
        stored in ``self.attachment``.  ``verbosity`` is kept on the instance.

        Raises ``InvalidMessageError`` if there is no attachment, more than
        one, or one whose payload cannot be decoded.
        """

        self.verbosity = verbosity

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)
        self.subject = str(message["Subject"]).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        Log.info('Importing email: "{}"'.format(self.subject),
                 Log.COMPONENT_MAIL)

        attachments = []
        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            # The disposition type is the text before the first ";"; trim it
            # so stray whitespace around it does not hide an attachment.
            disposition_type = content_disposition.strip().split(";")[0]
            if disposition_type.strip().lower() != "attachment":
                continue

            # Honour the declared Content-Transfer-Encoding rather than
            # unconditionally base64-decoding the payload.
            file_data = part.get_payload(decode=True)
            if file_data is None:
                # Multipart containers have no payload of their own, so
                # decode=True returns None for them.
                raise InvalidMessageError(
                    "The attachment to this message could not be decoded")

            attachments.append(
                Attachment(file_data,
                           content_type=part.get_content_type()))

        if len(attachments) == 0:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically.")

        self.attachment = attachments[0]
Esempio n. 9
0
def mathematica_package(math_package, extra_args, user):
    """
    Runs the given Math package on the Thorek01 server.

    The package code (from ``math_package.replace_exports``) has its
    ``{extra_args}`` and ``{data}`` placeholders filled from ``extra_args``,
    is written to ``package.m`` in a remote scratch directory created with
    ``mktemp -d`` and is run with ``math -script`` under ``xvfb-run``.  The
    scratch directory is then copied to
    ``MEDIA_ROOT/<username>/output/<package name>_<mmddyy_HHMM>``.  Any
    output returned by the command is logged, and a completion mail is sent
    when the user has a non-empty ``email``.

    The SSH connection is closed on every exit path, including exceptions.
    """
    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)

    try:
        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        # Only the rewritten code is used; the second value is not needed here.
        code, _matches = math_package.replace_exports(temp)

        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])

        Log.debug('atmospherics.analysis.tasks.mathematica_package', code)
        # Turn every ' into '"'"' so the code can sit inside the single-quoted
        # shell string built below.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

        if ret:
            message = 'A message was returned by mathematica script  {}.m:\n{}'.format(math_package.name, ret[-100:])
            Log.info('atmospherics.analysis.tasks.mathematica_package', message)

        #ssh.execute('rm {}'.format(os.path.join(temp, 'package.m')))

        target = os.path.join(settings.MEDIA_ROOT, user.username, 'output', math_package.name.replace(' ', '_')+datetime.now().strftime('_%m%d%y_%H%M'))
        # Raises if the directory already exists (the name only has minute
        # resolution); the connection is still closed by the ``finally``.
        os.makedirs(target)
        ssh.get_d(temp, target)

        #ssh.execute('rm -rf {}'.format(temp))
        ssh.execute('disown')
    finally:
        # Previously the connection leaked whenever a step above raised.
        ssh.close()

    # NOTE(review): the URL path is output/<username>/..., while the files are
    # stored under <username>/output/... -- confirm the URL routing.
    message = 'Mathematica package {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_package.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info('atmospherics.analysis.tasks.mathematica_package', message)
    # getattr() tolerates user objects without an ``email`` attribute, which
    # would have raised AttributeError after the work had already been done.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_package.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject,
                                       message,
                                       from_email,
                                       [user.email])
        email.send()
Esempio n. 10
0
    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.

        Every message's attachment is written into ``Consumer.CONSUME`` and
        the resulting file's access and modification times are set to the
        message time.  ``self.last_checked`` is refreshed on each call,
        whether or not fetching is enabled.
        """

        if self._enabled:

            Log.info("Checking mail", Log.COMPONENT_MAIL)

            for message in self._get_messages():

                Log.debug('Storing email: "{}"'.format(message.subject),
                          Log.COMPONENT_MAIL)

                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(Consumer.CONSUME, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)

                # Set the timestamp after the file is closed: data still
                # buffered inside the ``with`` block is flushed on close,
                # which would reset the modification time to the current time.
                os.utime(file_name, times=(t, t))

        self.last_checked = datetime.datetime.now()