def Process(self, **kwargs):
    """Parse the DOCX file given as ``kwargs['fp']`` into a MappingDocuments.

    The base-class ``Process`` is invoked first with the same keyword
    arguments. The file is then parsed with ``OOXML.parse_ooxml`` (errors
    raised by the parser itself are not caught here) and the parsed
    content, metadata and status flags are copied onto a new
    ``MappingDocuments`` instance.

    Returns:
        The populated ``MappingDocuments`` object, or ``None`` when copying
        the parsed values raised an exception (the error is printed).
    """
    print("-[Plugin]: DOCX Plugin Started.")
    super(DOCXPlugin, self).Process(**kwargs)

    document = MappingDocuments()
    parser = OOXML(kwargs['fp'])
    parser.parse_ooxml(document.ole_path, 'docx')

    # (attribute on the document, key in the parsed metadata); a key of
    # None marks a field that this format leaves unset.
    field_sources = (
        ('title', 'Title'),
        ('subject', 'Subject'),
        ('author', 'Author'),
        ('tags', 'Tags'),
        ('explanation', None),
        ('lastsavedby', 'LastSavedBy'),
        ('version', 'Version'),
        ('date', None),
        ('lastprintedtime', 'LastPrintedTime'),
        ('createdtime', 'CreatedTime'),
        ('lastsavedtime', 'LastSavedTime'),
        ('comment', 'Comment'),
        ('revisionnumber', 'RevisionNumber'),
        ('category', 'Category'),
        ('manager', 'Manager'),
        ('company', 'Company'),
        ('programname', 'ProgramName'),
        ('totaltime', 'TotalTime'),
        ('creator', None),
        ('trapped', None),
    )

    try:
        document.content = parser.content
        for attribute, key in field_sources:
            if key is None:
                setattr(document, attribute, None)
            else:
                setattr(document, attribute, parser.metadata[key])
        # creation_time / last_written_time are not filled in from the
        # parsed metadata here.
        document.has_metadata = parser.has_metadata
        document.has_content = parser.has_content
        document.is_damaged = parser.is_damaged
        return document
    except Exception as ex:
        print('[Error]%s-%s' % (ex, document.name))
        return None
def documentFilter(self, data, file):
    """Fill ``data`` from a file record, parse the document and index it.

    Elasticsearch connection settings are read from ``config/carpe.conf``
    located two directory levels above this module. File attributes are
    copied from ``file`` onto ``data``, the exported file is parsed with the
    parser matching its extension, and ``data.__dict__`` is indexed into the
    ``documents`` index.

    Args:
        data: Document mapping object. ``work_dir``, ``case_id`` and
            ``evdnc_id`` are read from it; all other attributes used here
            are written by this method.
        file: Indexable record. Positions used: 0 (document id), 4 (file
            name), 10 (size), 11 / 12 / 13 (last-written / last-access /
            creation timestamps passed to ``datetime.fromtimestamp``),
            33 (parent path), 34 (extension), 38 (prefix of the
            extraction directory).

    Returns:
        ``True`` when the document was parsed and indexed, ``False`` when
        copying the parsed values or indexing raised an exception (the
        error is printed), and ``None`` for PDF files (currently disabled)
        and unsupported extensions.

    Raises:
        Exception: If the configuration file does not exist.
    """
    config = configparser.ConfigParser()
    conf_file = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))) + os.sep + 'config' + os.sep + 'carpe.conf'
    if not os.path.exists(conf_file):
        raise Exception('%s file does not exist.\n' % conf_file)
    config.read(conf_file)
    _host = config.get('elasticsearch', 'host')
    _port = config.getint('elasticsearch', 'port')
    _es_id = config.get('elasticsearch', 'id')
    _es_passwd = config.get('elasticsearch', 'passwd')
    es = Elasticsearch(hosts=_host, port=_port, http_auth=(_es_id, _es_passwd))
    index_name = 'documents'
    type_name = 'document'

    data.name = file[4]  # file name
    data.ext = file[34]  # file extension
    data.parent_full_path = file[33]  # parent path
    data.path_with_ext = file[33] + file[4]  # full file path
    # Full path without the extension. Only a trailing ".<ext>" is removed:
    # a blanket str.replace would also strip matching text from directory
    # names, and would strip every dot when the extension is empty.
    suffix = '.' + data.ext
    if data.ext and data.path_with_ext.endswith(suffix):
        data.full_path = data.path_with_ext[:-len(suffix)]
    else:
        data.full_path = data.path_with_ext
    file_export_path = data.work_dir + file[4]
    data.creation_time = datetime.fromtimestamp(file[13])
    data.last_written_time = datetime.fromtimestamp(file[11])
    data.last_access_time = datetime.fromtimestamp(file[12])
    data.original_size = file[10]
    data.has_exif = 'No'
    data.doc_id = file[0]
    data.doc_type = 'documents'
    data.doc_type_sub = data.ext
    data.download_path = data.case_id + '/' + data.evdnc_id + '/documents/' + data.name
    data.ole_path = file[38] + "_extracted"

    # (attribute on ``data``, key in the parser's metadata mapping), in the
    # order the attributes are assigned.
    metadata_fields = (
        ('title', 'Title'),
        ('subject', 'Subject'),
        ('author', 'Author'),
        ('tags', 'Tags'),
        ('explanation', 'Explanation'),
        ('lastsavedby', 'LastSavedBy'),
        ('version', 'Version'),
        ('date', 'Date'),
        ('lastprintedtime', 'LastPrintedTime'),
        ('createdtime', 'CreatedTime'),
        ('lastsavedtime', 'LastSavedTime'),
        ('comment', 'Comment'),
        ('revisionnumber', 'RevisionNumber'),
        ('category', 'Category'),
        ('manager', 'Manager'),
        ('company', 'Company'),
        ('programname', 'ProgramName'),
        ('totaltime', 'TotalTime'),
        ('creator', 'Creator'),
        ('trapped', 'Trapped'),
    )

    # Compare extensions by equality / tuple membership. ``ext in 'pdf'`` is
    # a substring test, so '' or 'p' would be treated as PDF and 'w' as HWP.
    ext = data.ext.lower()
    if ext == 'pdf':
        # PDF parsing is currently disabled (the previous implementation
        # was commented out); PDF files are skipped without being indexed.
        return None
    elif ext == 'hwp':
        parser = HWP(file_export_path)
        parser.parse(data.ole_path)
        metadata_attr = 'metaList'
        damaged_attr = 'isDamaged'
        unset_fields = ()
    elif ext in ('doc', 'xls', 'ppt'):
        parser = Compound(file_export_path)
        parser.parse(data.ole_path)
        metadata_attr = 'metadata'
        damaged_attr = 'is_damaged'
        unset_fields = ()
    elif ext in ('docx', 'xlsx', 'pptx'):
        parser = OOXML(file_export_path)
        parser.parse_ooxml(data.ole_path)
        metadata_attr = 'metadata'
        damaged_attr = 'is_damaged'
        # These fields are not taken from OOXML metadata and are stored as None.
        unset_fields = ('explanation', 'date', 'creator', 'trapped')
    else:
        print('this file format not supported!')
        return None

    # Parser construction and parsing above are deliberately outside the
    # try block: only copying the results and indexing are best-effort.
    try:
        data.content = parser.content
        for attribute, key in metadata_fields:
            if attribute in unset_fields:
                setattr(data, attribute, None)
            else:
                setattr(data, attribute, getattr(parser, metadata_attr)[key])
        data.has_metadata = parser.has_metadata
        data.has_content = parser.has_content
        data.is_damaged = getattr(parser, damaged_attr)
        print(f"{data.__dict__['name']}")
        es.index(index=index_name, doc_type=type_name, body=data.__dict__)
        return True
    except Exception as ex:
        print('[Error]%s-%s' % (ex, data.name))
        return False