Ejemplo n.º 1
0
    def Parse(self, FileName, FileData):
        """Parse raw file content into text, metadata and optional OCR output.

        FileName is stored as the resource name in the parser metadata when it
        is non-empty. FileData is the raw content: it is wrapped in an input
        stream for the parser and, for images, passed to the OCR proxy and the
        thumbnail generator.

        Returns a FileParserResponse. Exceptions raised by the parsing steps
        are caught here: ``success`` is set to False and ``message`` holds the
        exception text. On the normal path ``success`` is True.
        """
        resp = FileParserResponse()

        try:
            meta = self.Metadata()
            if FileName:
                meta.set(self.Metadata.RESOURCE_NAME_KEY, FileName)
            # NOTE(review): -1 presumably lifts the handler's write limit -
            # confirm against the BodyContentHandler documentation.
            contentHandler = self.BodyContentHandler(-1)
            inputStream = self.ByteArrayInputStream(FileData)
            self.parser.parse(inputStream, contentHandler, meta)

            try:
                resp.text = contentHandler.toString()
            except Exception as convEx:
                # Fall back to the raw object carried by the conversion error.
                # NOTE(review): assumes the exception exposes `.object` (as
                # UnicodeError does); any other exception type ends up in the
                # outer handler below.
                resp.text = BinaryStringParser.Parse(convEx.object)

            for name in meta.names():
                try:
                    resp.meta[name] = meta.get(name)
                except Exception:
                    # Best effort: a metadata value that cannot be read is
                    # recorded as empty. Unlike a bare except, this does not
                    # swallow KeyboardInterrupt/SystemExit.
                    resp.meta[name] = ''

            # Drop the references to the stream and handler before the
            # (optional) OCR step.
            inputStream = None
            contentHandler = None

            if 'Content-Type' in resp.meta and ContentTypeAnalyzer.IsImageByContentType(
                    resp.meta['Content-Type']):
                self.logger.LogMessage(
                    'info', 'performing ocr on {0}'.format(FileName))
                ocrResp = self.ocrProxy.PerformOCR(FileData)

                if ocrResp.success:
                    # Append the OCR text to whatever the parser extracted.
                    resp.text = self.NormalizeText('{0}{1}'.format(
                        resp.text, ocrResp.text))
                    resp.ocrPerformed = True
                else:
                    # OCR failure is logged but does not fail the parse.
                    self.logger.LogMessage(
                        'info', 'could not perform ocr on {0} {1}'.format(
                            FileName, ocrResp.message))

                resp.thumbnail = self.GenerateThumbnail(FileData)

            resp.success = True
        except Exception as ex:
            resp.success = False
            resp.message = str(ex)

        return resp
Ejemplo n.º 2
0
    def Parse(self, FileName, FileData):
        """Delegate parsing to the PDF parser or to the Tika parser.

        The parser is chosen from FileName via ContentTypeAnalyzer.IsPdf; the
        selected parser's Parse result is returned unchanged.
        """
        chosenParser = (self.pdfParser
                        if ContentTypeAnalyzer.IsPdf(FileName)
                        else self.tikaParser)
        return chosenParser.Parse(FileName, FileData)
Ejemplo n.º 3
0
 def SetImageTag(self, AmbarFile):
     """Add the 'image' auto tag when the file's content type is an image.

     Reads AmbarFile['content']['type'] for the check and passes
     AmbarFile['file_id'] and AmbarFile['meta']['full_name'] to
     AddTagToAmbarFile together with AUTO_TAG_TYPE.
     """
     isImage = ContentTypeAnalyzer.IsImageByContentType
     if not isImage(AmbarFile['content']['type']):
         return

     addTag = self.AddTagToAmbarFile
     fileId = AmbarFile['file_id']
     fullName = AmbarFile['meta']['full_name']
     addTag(fileId, fullName, self.AUTO_TAG_TYPE, 'image')
Ejemplo n.º 4
0
 def SetArchiveTag(self, AmbarFile):
     """Add the 'archive' auto tag when the file name denotes an archive.

     The check is made on AmbarFile['meta']['full_name']; the tag is added
     through AddTagToAmbarFile with AUTO_TAG_TYPE.
     """
     isArchive = ContentTypeAnalyzer.IsArchive
     if not isArchive(AmbarFile['meta']['full_name']):
         return

     addTag = self.AddTagToAmbarFile
     fileId = AmbarFile['file_id']
     fullName = AmbarFile['meta']['full_name']
     addTag(fileId, fullName, self.AUTO_TAG_TYPE, 'archive')
Ejemplo n.º 5
0
 def SetVideoTag(self, AmbarFile):
     """Add the 'video' auto tag when the file's content type is a video.

     Reads AmbarFile['content']['type'] for the check and passes
     AmbarFile['file_id'] and AmbarFile['meta']['full_name'] to
     AddTagToAmbarFile together with AUTO_TAG_TYPE.
     """
     isVideo = ContentTypeAnalyzer.IsVideoByContentType
     if not isVideo(AmbarFile['content']['type']):
         return

     addTag = self.AddTagToAmbarFile
     fileId = AmbarFile['file_id']
     fullName = AmbarFile['meta']['full_name']
     addTag(fileId, fullName, self.AUTO_TAG_TYPE, 'video')
Ejemplo n.º 6
0
 def SetTextTag(self, AmbarFile):
     """Add the 'text' auto tag when the file's content type is textual.

     Reads AmbarFile['content']['type'] for the check and passes
     AmbarFile['file_id'] and AmbarFile['meta']['full_name'] to
     AddTagToAmbarFile together with AUTO_TAG_TYPE.
     """
     isText = ContentTypeAnalyzer.IsTextByContentType
     if not isText(AmbarFile['content']['type']):
         return

     addTag = self.AddTagToAmbarFile
     fileId = AmbarFile['file_id']
     fullName = AmbarFile['meta']['full_name']
     addTag(fileId, fullName, self.AUTO_TAG_TYPE, 'text')
Ejemplo n.º 7
0
def ProcessFile(message):
    """Handle one file task: hide the file on 'unlink', index it on 'add'/'change'.

    message is a mapping with 'meta' (which must contain at least 'source_id'
    and 'full_name'), 'event', and optionally 'sha'. When 'sha' is present the
    file is downloaded by that sha, otherwise by its full name.

    Returns True when the task finished (including ignored events and files
    whose meta already exists) and False when any step failed. Exceptions are
    caught, logged and reported as False.
    """
    try:
        meta = message['meta']
        event = message['event']
        sha = None

        logger.LogMessage('verbose', '{0} task received for {1}'.format(event, meta['full_name']))

        if ('sha' in message):
            sha = message['sha']

        # The file id is derived from the source id and the full file name.
        fileId = sha256('{0}{1}'.format(meta['source_id'],meta['full_name']).encode('utf-8')).hexdigest()

        if (event == 'unlink'):
            apiResp = apiProxy.HideFile(fileId)

            if not apiResp.Success:
                logger.LogMessage('error', 'error hidding file for {0} {1}'.format(meta['full_name'], apiResp.message))
                return False

            if apiResp.Ok:
                logger.LogMessage('verbose', 'removed {0}'.format(meta['full_name']))
                return True

            if not apiResp.NotFound:
                logger.LogMessage('error', 'error hidding file {0} {1} code: {2}'.format(meta['full_name'], apiResp.message, apiResp.code))
                return False

            # NotFound: there was nothing to hide, which still counts as done.
            return True

        if (event != 'add' and event != 'change'):
            # NOTE(review): this is the only message that bypasses `logger`.
            print('Ignoring {0}'.format(event))
            return True

        apiResp = apiProxy.CheckIfMetaExists(meta)

        if not apiResp.Success:
            logger.LogMessage('error', 'error checking meta existance for {0} {1}'.format(meta['full_name'], apiResp.message))
            return False

        if apiResp.Ok:
            # The meta is already known, so there is nothing left to do.
            logger.LogMessage('verbose', 'meta found for {0}'.format(meta['full_name']))
            return True

        if not apiResp.NotFound:
            logger.LogMessage('error', 'error checking meta existance for {0} {1} {2}'.format(meta['full_name'], apiResp.code, apiResp.message))
            return False

        apiResp = apiProxy.UnhideFile(fileId)

        if not apiResp.Success:
            logger.LogMessage('error', 'error unhiding file {0} {1}'.format(meta['full_name'], apiResp.message))
            return False

        if not (apiResp.Ok or apiResp.NotFound):
            # fileMeta is not built yet at this point, so the name has to come
            # from the raw meta; using fileMeta here raised UnboundLocalError
            # and hid the real response code and message.
            logger.LogMessage('error', 'error unhiding file, unexpected response code {0} {1} {2}'.format(meta['full_name'], apiResp.code, apiResp.message))
            return False

        fileMeta = AmbarFileMeta.Init(meta)

        if not fileMeta.initialized:
            logger.LogMessage('error', 'error initializing file meta {0}'.format(fileMeta.message))
            return False

        if (sha):
            apiResp = apiProxy.DownloadFileBySha(sha)
        else:
            apiResp = apiProxy.DownloadFile(fileMeta.full_name)

        if not apiResp.Success:
            logger.LogMessage('error', 'error downloading file {0} {1}'.format(fileMeta.full_name, apiResp.message))
            return False

        if not apiResp.Ok:
            logger.LogMessage('error', 'error downloading file {0} {1} code: {2}'.format(fileMeta.full_name, apiResp.message, apiResp.code))
            return False

        fileData = apiResp.payload

        # From here on `sha` is the hash of the downloaded content, replacing
        # any sha that came with the message.
        sha = sha256(fileData).hexdigest()

        hasParsedContent = False
        fileContent = {}

        # Look for previously parsed content stored under the same sha.
        apiResp = apiProxy.GetParsedFileContentFields(sha)

        if not apiResp.Success:
            logger.LogMessage('error', 'error retrieving parsed file content fields {0} {1}'.format(
                fileMeta.full_name, apiResp.message))
            return False

        if not (apiResp.Ok or apiResp.NotFound):
            logger.LogMessage('error', 'error retrieving parsed file content fields {0} {1} {2}'.format(
                fileMeta.full_name, apiResp.code, apiResp.message))
            return False

        if apiResp.Ok:
            hasParsedContent = True
            fileContent = apiResp.payload

        if hasParsedContent:
            apiResp = apiProxy.GetParsedFileContent(sha)

            if not apiResp.Success:
                logger.LogMessage('error', 'error retrieving parsed file content {0} {1}'.format(
                    fileMeta.full_name, apiResp.message))
                return False

            if not (apiResp.Ok or apiResp.NotFound):
                logger.LogMessage('error', 'error retrieving parsed file content {0} {1} {2}'.format(
                    fileMeta.full_name, apiResp.code, apiResp.message))
                return False

            if apiResp.NotFound:
                # Fields exist but the text does not: parse the file again.
                hasParsedContent = False

            if apiResp.Ok:
                hasParsedContent = True
                fileContent['text'] = apiResp.payload.decode('utf-8', 'ignore')
                logger.LogMessage(
                    'verbose', 'parsed content found {0}'.format(fileMeta.full_name))

        if not hasParsedContent:
            # checking if file is archive
            if ContentTypeAnalyzer.IsArchive(fileMeta.short_name):
                archiveProcessor.Process(fileData, fileMeta, fileMeta.source_id)

            # checking if file is pst
            if ContentTypeAnalyzer.IsPst(fileMeta.short_name):
                pstProcessor.Process(fileData, fileMeta, fileMeta.source_id)

            # extracting
            logger.LogMessage('verbose', 'parsing {0}'.format(fileMeta.full_name))
            fileParserResp = fileParser.Parse(fileMeta.short_name, fileData)

            if not fileParserResp.success:
                logger.LogMessage('error', 'error parsing {0} {1}'.format(
                    fileMeta.full_name, fileParserResp.message))
                return False

            logger.LogMessage(
                'verbose', 'successfully parsed {0}'.format(fileMeta.full_name))

            # building Ambar File Content
            # NOTE(review): sys.getsizeof reports the in-memory size of the
            # object (including interpreter overhead), not the payload length;
            # confirm whether len(fileData) was intended.
            fileContent = AmbarFileContent.Init(fileParserResp, sys.getsizeof(fileData))

            # submitting thumbnail (only the first element of the thumbnail
            # sequence is sent)
            if fileParserResp.thumbnail:
                logger.LogMessage(
                    'verbose', 'submitting thumbnail {0}'.format(fileMeta.full_name))
                apiResp = apiProxy.SubmitThumbnail(
                    sha, fileParserResp.thumbnail[0])

                if not apiResp.Success:
                    logger.LogMessage('error', 'error submitting thumbnail to Api {0} {1}'.format(
                        fileMeta.full_name, apiResp.message))
                    return False

                if not apiResp.Ok:
                    logger.LogMessage('error', 'error submitting thumbnail to Api, unexpected response code {0} {1} {2}'.format(
                        fileMeta.full_name, apiResp.code, apiResp.message))
                    return False

                fileContent.thumb_available = True
                logger.LogMessage('verbose', 'thumbnail submited {0}'.format(fileMeta.full_name))

            # submitting parsed text to Api (skipped for archives and pst files)
            if not  ContentTypeAnalyzer.IsArchive(fileMeta.short_name) and not ContentTypeAnalyzer.IsPst(fileMeta.short_name):
                logger.LogMessage('verbose', 'submitting parsed text {0}'.format(fileMeta.full_name))

                apiResp = apiProxy.SubmitExtractedContent(
                    sha, fileContent.text.encode(encoding='utf_8', errors='ignore'))

                if not apiResp.Success:
                    logger.LogMessage('error', 'error submitting parsed text to Api {0} {1}'.format(fileMeta.full_name, apiResp.message))
                    return False

                if not apiResp.Ok:
                    logger.LogMessage('error', 'error submitting parsed text to Api, unexpected response code {0} {1} {2}'.format(fileMeta.full_name, apiResp.code, apiResp.message))
                    return False

                logger.LogMessage('verbose', 'parsed text submited {0}'.format(fileMeta.full_name))

        # submitting processed file to Api
        logger.LogMessage('verbose', 'submitting parsed content {0}'.format(fileMeta.full_name))

        ambarFile = {}
        # fileContent is an AmbarFileContent when freshly parsed, or the
        # payload returned by the Api when parsed content was reused.
        ambarFile['content'] = fileContent.Dict if isinstance(fileContent, AmbarFileContent) else fileContent
        ambarFile['meta'] = fileMeta.Dict
        ambarFile['sha256'] = sha
        ambarFile['file_id'] = fileId
        # Timestamp with millisecond precision (microseconds cut to 3 digits).
        ambarFile['indexed_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

        apiResp = apiProxy.SubmitProcessedFile(fileId, json.dumps(dict(ambarFile)).encode(encoding='utf_8', errors='ignore'))

        if not apiResp.Success:
            logger.LogMessage('error', 'error submitting parsed content to Api {0} {1}'.format(
                fileMeta.full_name, apiResp.message))
            return False

        if not (apiResp.Ok or apiResp.Created):
            logger.LogMessage('error', 'error submitting parsed content to Api, unexpected response code {0} {1} {2}'.format(
                fileMeta.full_name, apiResp.code, apiResp.message))
            return False

        logger.LogMessage('verbose', 'parsed content submited {0}'.format(fileMeta.full_name))

        apiResp = apiProxy.AddMetaIdToCache(fileMeta.id)

        if not apiResp.Success:
            logger.LogMessage('error', 'error adding meta id to cache {0} {1}'.format(fileMeta.full_name, apiResp.message))
            return False

        if not apiResp.Ok:
            logger.LogMessage('error', 'error adding meta id to cache, unexpected response code {0} {1} {2}'.format(fileMeta.full_name, apiResp.code, apiResp.message))
            return False

        # removing original file
        if not preserveOriginals:
            apiResp = apiProxy.RemoveFileContent(sha)

            if not apiResp.Success:
                logger.LogMessage('error', 'error removing original file from Ambar for {0} {1}'.format(fileMeta.full_name, apiResp.message))
                return False

            if not (apiResp.Ok or apiResp.NotFound):
                logger.LogMessage('error', 'error removing original file from Ambar for {0}, unexpected response code {1} {2}'.format(fileMeta.full_name, apiResp.code, apiResp.message))
                return False

            if apiResp.Ok:
                logger.LogMessage(
                    'verbose', 'original file removed from Ambar for {0}'.format(fileMeta.full_name))

        ## tags: existing auto tags are removed before the file is re-tagged
        apiResp = apiProxy.RemoveAutoTags(fileId)
        if not apiResp.Success:
            logger.LogMessage('error', 'error removing autotags {0} {1}'.format(fileMeta.full_name, apiResp.message))
            return False

        if not apiResp.Ok:
            logger.LogMessage('error', 'error removing autotags, unexpected response code {0} {1} {2}'.format(fileMeta.full_name, apiResp.code, apiResp.message))
            return False

        autoTagger.AutoTagAmbarFile(ambarFile)

        return True
    except Exception as e:
        logger.LogMessage('error', 'error processing task {0}'.format(repr(e)))
        return False