Esempio n. 1
0
def tika(conf, attachments):
    """Attach Apache Tika reports to the whitelisted attachments, in place.

    Nothing happens unless ``conf["enabled"]`` is truthy. Otherwise every
    attachment that is not flagged ``is_filtered`` and whose ``Content-Type``
    is listed in ``conf["whitelist_cont_types"]`` receives a ``"tika"`` key
    holding the result of ``TikaApp.extract_all_content``.

    Args:
        conf (dict): conf of this post processor; the keys read here are
            ``enabled``, ``path_jar``, ``memory_allocation`` and
            ``whitelist_cont_types``
        attachments (list): all attachments of email

    Returns:
        None. The dicts inside ``attachments`` are updated in place.
    """

    if not conf["enabled"]:
        return

    # Imported here, so tikapp is only needed when the analysis is enabled.
    from tikapp import TikaApp
    tika_app = TikaApp(file_jar=conf["path_jar"],
                       memory_allocation=conf["memory_allocation"])

    for attachment in attachments:
        if attachment.get("is_filtered", False):
            continue

        if attachment["Content-Type"] not in conf["whitelist_cont_types"]:
            continue

        payload = attachment["payload"]

        # tika-app only gets payload in base64
        if attachment["content_transfer_encoding"] != "base64":
            payload = payload.encode("base64")

        attachment["tika"] = tika_app.extract_all_content(
            payload=payload, convert_to_obj=True)
Esempio n. 2
0
def getTextFile(nameOfPDF, jarPath):
    """Run Tika over ``nameOfPDF`` and save what it returns as a text file.

    The jar used is ``tika-app-1.22.jar`` inside ``jarPath``. The output
    file name is the input name with a trailing ``.pdf`` dropped (any other
    name is kept whole) and ``.txt`` appended.

    Returns:
        The name of the text file that was written.
    """
    tika_client = TikaApp(file_jar=join(jarPath, "tika-app-1.22.jar"))

    # Swap a literal ".pdf" suffix for ".txt"; otherwise just append ".txt".
    stem = nameOfPDF[:-4] if nameOfPDF[-4:] == ".pdf" else nameOfPDF
    text_name = stem + ".txt"

    with open(nameOfPDF) as fin:
        content = tika_client.extract_all_content(objectInput=fin)

        # Characters that cannot be encoded are replaced instead of raising.
        with open(text_name, "w", encoding='utf-8',
                  errors='replace') as fout:
            fout.write(content)
            return text_name
Esempio n. 3
0
def run(message):
    """Extract the text of ``message["path"]`` with Tika and emit events.

    When Tika returns a result, it is written UTF-8 encoded to a new file
    under ``./data/processing/`` and a child message describing that file is
    sent. Image extraction and the re-emission of the original message
    happen whether or not text was found.

    Args:
        message (dict): event describing the file; ``"path"`` and
            ``"identifier"`` are read here.
    """
    tika_client = TikaApp(file_jar="./tika-app/tika-app-1.21.jar")
    tika_result = tika_client.extract_only_content(message["path"])

    # None is a singleton: test identity instead of comparing with !=.
    if tika_result is not None:
        processing_dir = "./data/processing/"
        identifier = str(uuid.uuid4())
        workfile = processing_dir + identifier
        with open(workfile, 'wb') as f:
            f.write(tika_result.encode('UTF-8'))

        new_message = {
            "identifier": identifier,
            "parent": message["identifier"],
            "path": workfile,
            "filename": "pdf.txt",
            "filetype": "unknown",
            "history": [],
            "metadata": {},
            "original_file": False
        }
        sendEvent(new_message)
    extract_images_pymupdf(message)

    sendEvent(message)
Esempio n. 4
0
class pdf_parser:
    """Turn the Tika output for one document into a one-row DataFrame."""

    def __init__(self, tika_jar_path):
        """Create the Tika client from the jar at ``tika_jar_path``."""
        self.tika_client = TikaApp(file_jar=tika_jar_path)

    def parse(self, doc_path, file=False):
        """Parse a document and return its date and text content.

        Args:
            doc_path: a path handed to Tika as ``path``, or, when ``file``
                is true, the data that is base64-encoded and handed over as
                ``payload``.
            file (bool): selects the payload mode described above.

        Returns:
            A DataFrame with the columns ``date`` and ``content`` and a
            single row at index 0.
        """
        if file:
            raw = self.tika_client.extract_all_content(
                payload=base64.b64encode(doc_path))
        else:
            raw = self.tika_client.extract_all_content(path=doc_path)
        head = json.loads(raw)[0]

        # Drop every newline that is not followed by another newline, then
        # squeeze the remaining runs of newlines down to one.
        text = re.sub(r'\n(?![\n])', r'', head['X-TIKA:content'])
        text = re.sub(r'(\n)(\n+)', r'\1', text)

        # First non-empty date field wins; the current time is the fallback.
        for key in ('Last-Modified', 'Last-Save-Date', 'Creation-Date'):
            date_string = head.get(key)
            if date_string:
                break
        else:
            date_string = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')

        if date_string:
            parsed_date = dateutil.parser.parse(date_string)
            date_string = str(parsed_date.date())

        df = pd.DataFrame(columns=['date', 'content'])
        df.loc[0] = [date_string, text]
        return df
Esempio n. 5
0
    def __init__(self,
                 index,
                 es_host="localhost:9200",
                 file_jar='/Users/laofeng/es_home/tika-app-1.23.jar',
                 index_content=False,
                 create_index=True,
                 force_renew_index=False,
                 schema_file=None):
        """Set up the Tika and Elasticsearch clients and prepare the index.

        Args:
            index: name of the Elasticsearch index.
            es_host: host passed to ``Elasticsearch``.
            file_jar: Tika app jar passed to ``TikaApp``.
            index_content: stored on the instance as ``index_content``.
            create_index: create a missing index from ``schema_file``
                (only done when a schema file is given).
            force_renew_index: delete an existing index and create it again
                from ``schema_file``.
            schema_file: UTF-8 JSON file holding the index schema.

        NOTE(review): with ``force_renew_index`` set and no ``schema_file``
        the existing index is deleted before ``open(schema_file)`` fails.
        """
        self.tika_app = TikaApp(file_jar)
        self.es = Elasticsearch(es_host)
        self.index = index
        self.index_content = index_content

        def create_from_schema():
            # Load the schema and create the index while the file is open.
            with open(schema_file, 'r', encoding='utf-8') as f:
                schema = json.load(f)
                self.es.indices.create(index, schema)

        # Check whether the index already exists.
        index_exist = self.es.indices.exists(index)

        if index_exist:
            # Already there: on a forced renew, delete it and build it anew.
            if force_renew_index:
                self.es.indices.delete(index)
                create_from_schema()
        elif create_index and schema_file:
            create_from_schema()
Esempio n. 6
0
    def __init__(self,
                 jar=None,
                 memory_allocation=None,
                 valid_content_types=None):
        """Create the Tika client and remember the settings used for it.

        Args:
            jar: Tika app jar, passed to ``TikaApp`` as ``file_jar``.
            memory_allocation: passed to ``TikaApp`` unchanged.
            valid_content_types: content types to analyze; when omitted
                (``None``) every instance gets its own empty set.
        """
        # A literal ``set()`` default is created once and then shared by
        # every instance built without this argument.
        if valid_content_types is None:
            valid_content_types = set()

        # Init Tika
        self._tika_client = TikaApp(file_jar=jar,
                                    memory_allocation=memory_allocation)
        self._jar = jar
        self._memory_allocation = memory_allocation
        self._valid_content_types = valid_content_types
Esempio n. 7
0
class TikaReader(object):
    """Small facade over a ``TikaApp`` client bound to one jar file."""

    def __init__(self, path):
        """Create the Tika client from the jar located at ``path``."""
        self.tika_client = TikaApp(file_jar=path)

    def detect_type(self, doc):
        """Return the client's ``detect_content_type`` result for ``doc``."""
        client = self.tika_client
        return client.detect_content_type(doc)

    def detect_language(self, doc):
        """Return the client's ``detect_language`` result for ``doc``."""
        client = self.tika_client
        return client.detect_language(doc)

    def content(self, doc):
        """Return the client's ``extract_all_content`` result for ``doc``."""
        client = self.tika_client
        return client.extract_all_content(doc)
 def handle_backup_file(self, context, savepkt):
     """Index the Tika output of a backed-up regular file in Elasticsearch.

     For regular files (``FT_REG``) the file is run through the Tika app,
     the first JSON document of the result is enriched with Bareos job
     fields and sent to the ``bareos-test`` index. Every failure is reported
     through ``JobMessage`` at ``M_ERROR`` level; ``bRC_OK`` is returned in
     every case.

     Args:
         context: plugin context handed on to the message helpers.
         savepkt: packet of the file being backed up; ``fname`` and ``type``
             are read here.
     """
     DebugMessage(context, 100, "handle_backup_file called with " + str(savepkt) + "\n")
     DebugMessage(context, 100, "fname: " + savepkt.fname + " Type: " + str(savepkt.type) + "\n")
     if savepkt.type == bFileType['FT_REG']:
         DebugMessage(context, 100, "regulaer file, do something now...\n")
         # configure your Elasticsearch server here:
         es = Elasticsearch([{'host': '192.168.17.2', 'port': 9200}])
         # configure your TikaApp jar file here:
         try:
             tika_client = TikaApp(file_jar="/usr/local/bin/tika-app-1.20.jar")
         except Exception as ex:
             JobMessage(context, bJobMessageType['M_ERROR'], 'Error indexing %s. Tika error: %s' % (savepkt.fname, str(ex)))
             return bRCs['bRC_OK']
         # tika_client has several parser options.
         # extract_only_metadata(savepkt.fname) would return metadata only;
         # the call used here includes the file contents as text.
         try:
             result_payload = tika_client.extract_all_content(savepkt.fname)
         except Exception as ex:
             JobMessage(context, bJobMessageType['M_ERROR'], 'Error extracting contents from %s. Tika error: %s' % (savepkt.fname, str(ex)))
             return bRCs['bRC_OK']
         # result_payload is a list of JSON strings. Nested structures like
         # tar files or emails with attachments or inline documents are
         # returned as distinct JSON strings.
         # The first string [0] contains information for the main file.
         # TODO: care about nested structures, for now we only take the first/main file
         try:
             data = json.loads(result_payload)[0]
         except Exception as ex:
             JobMessage(context, bJobMessageType['M_ERROR'],
                 'Error reading json fields delivered by Tika examining file %s. Json error: %s' % (savepkt.fname, str(ex)))
             return bRCs['bRC_OK']
         # Tika eventually adds "Unknown tag (id)" keys, with id as an
         # increasing number, which could exceed the keyword limit of
         # Elasticsearch indices, so those tags are removed.
         # Iterate over a snapshot of the keys: deleting from a dict while
         # iterating its live key view raises RuntimeError on Python 3.
         for data_keyword in list(data.keys()):
             if data_keyword.startswith("Unknown tag ("):
                 del data[data_keyword]
         # Tika adds some empty lines at the beginning of content, we strip it here
         if 'X-TIKA:content' in data:
             data['X-TIKA:content'] = data['X-TIKA:content'].strip()
         data['bareos_jobId'] = self.jobId
         data['bareos_fdname'] = self.fdname
         data['bareos_joblevel'] = unichr(self.level)
         data['bareos_directory'] = os.path.dirname(savepkt.fname)
         try:
             es.index(index="bareos-test", doc_type='_doc', body=data)
         except Exception as ex:
             JobMessage(context, bJobMessageType['M_ERROR'], 'Error indexing %s. Elastic error: %s' % (savepkt.fname, str(ex)))
     return bRCs['bRC_OK']
Esempio n. 9
0
def run(message):
    """Split the document at ``message["path"]`` into text and image events.

    Text: when Tika returns a result, it is written UTF-8 encoded to a new
    file under ``./data/processing/`` and announced with a child message.

    Images: the file is opened as a zip archive and unpacked into a
    temporary directory; every member whose name contains ``.png``,
    ``.jpeg`` or ``.jpg`` is moved into the processing directory and
    announced with its own child message. The temporary directory is
    removed afterwards and the original message is sent last.

    Args:
        message (dict): event describing the file; ``"path"`` and
            ``"identifier"`` are read here.
    """
    # Text
    tika_client = TikaApp(file_jar="./tika-app/tika-app-1.21.jar")
    tika_result = tika_client.extract_only_content(message["path"])

    # None is a singleton: test identity instead of comparing with !=.
    if tika_result is not None:
        processing_dir = "./data/processing/"
        identifier = str(uuid.uuid4())
        workfile = processing_dir + identifier
        with open(workfile, 'wb') as f:
            f.write(tika_result.encode('UTF-8'))

        new_message = {
            "identifier": identifier,
            "parent": message["identifier"],
            "path": workfile,
            "filename": "doc.txt",
            "filetype": "unknown",
            "history": [],
            "metadata": {},
            "original_file": False
        }
        sendEvent(new_message)

    # Images
    image_markers = (".png", ".jpeg", ".jpg")
    with ZipFile(message["path"], 'r') as zipObj:
        image_dir = "./data/processing"
        tmp_identifier = os.path.join(image_dir, str(uuid.uuid4()))
        zipObj.extractall(tmp_identifier)

        for root, dirs, files in os.walk(tmp_identifier):
            for filename in files:
                # Substring match, exactly as the markers appear in the name.
                if not any(marker in filename for marker in image_markers):
                    continue

                new_identifier = str(uuid.uuid4())
                processing_dest = os.path.join(image_dir, new_identifier)
                new_message = {
                    "identifier": new_identifier,
                    "parent": message["identifier"],
                    "path": processing_dest,
                    "filename": filename,
                    "filetype": "unknown",
                    "history": [],
                    "metadata": {},
                    "original_file": False
                }
                shutil.move(os.path.join(root, filename), processing_dest)
                sendEvent(new_message)
        shutil.rmtree(tmp_identifier)

    sendEvent(message)
Esempio n. 10
0
def tika(conf, attachments):
    """Attach Apache Tika reports to the whitelisted attachments, in place.

    Nothing happens unless ``conf["enabled"]`` is truthy. When the
    ``whitelist_content_types`` setting is missing or empty a warning is
    logged and no attachment is analyzed.

    Args:
        conf (dict): conf of this post processor
        attachments (list): all attachments of email

    Returns:
        None. The dicts inside ``attachments`` are updated in place.
    """

    if not conf["enabled"]:
        return

    # Imported here, so tikapp is only needed when the analysis is enabled.
    from tikapp import TikaApp

    tika_app = TikaApp(file_jar=conf["path_jar"],
                       memory_allocation=conf["memory_allocation"])

    wtlist = conf.get("whitelist_content_types", [])
    if not wtlist:
        log.warning(
            "Apache Tika analysis setted, without whitelist content types")
        return

    for attachment in attachments:
        if attachment.get("is_filtered", False):
            continue

        if attachment["Content-Type"] not in wtlist:
            continue

        payload = attachment["payload"]

        # tika-app only gets payload in base64
        if attachment["content_transfer_encoding"] != "base64":
            try:
                payload = payload.encode("base64")
            except UnicodeError:
                # e.g. content_transfer_encoding u'x-uuencode': the payload
                # is not binary and has a strange encoding, so skip it
                continue

        try:
            results = tika_app.extract_all_content(payload=payload,
                                                   convert_to_obj=True)
        except JSONDecodeError:
            log.warning(
                "JSONDecodeError for {!r} in Tika analysis".format(
                    attachment["md5"]))
        else:
            # Empty results are not stored.
            if results:
                attachment["tika"] = results
Esempio n. 11
0
def main():
    """Command line entry point: run the selected Tika actions and print them.

    The jar comes from ``args.jar`` or, failing that, from the
    ``TIKA_APP_JAR`` environment variable. The same input parameters (path,
    payload, or stdin) are handed to every selected action.
    """
    args = get_args()

    jar = args.jar or os.environ.get("TIKA_APP_JAR", None)
    tika = TikaApp(jar)

    parameters = {
        "path": args.file,
        "payload": args.payload,
        "objectInput": sys.stdin if args.stdin else None
    }

    # (argument flag, TikaApp method, pretty-print the output?)
    actions = (
        ("detect", "detect_content_type", False),
        ("text", "extract_only_content", False),
        ("language", "detect_language", False),
        ("all", "extract_all_content", True),
        ("metadata", "extract_only_metadata", True),
    )

    try:
        for flag, method_name, pretty in actions:
            if not getattr(args, flag):
                continue

            if pretty:
                # Only the last two actions set this, so it never reaches
                # an action that does not pretty-print.
                parameters["pretty_print"] = True

            print(getattr(tika, method_name)(**parameters))

    except IOError:
        # I/O errors are ignored on purpose.
        pass
Esempio n. 12
0
def main():
    """Command line entry point: analyze a file or a payload with Tika.

    The jar is looked up in this order: ``args.jar``, the ``TIKA_APP_JAR``
    environment variable, ``/opt/tika/tika-app-1.15.jar``. ``args.file``
    takes precedence over ``args.payload``; without either nothing is
    analyzed.
    """
    args = get_args()

    command_line = {"TIKA_APP_JAR": args.jar} if args.jar else dict()
    defaults = {"TIKA_APP_JAR": "/opt/tika/tika-app-1.15.jar"}
    options = ChainMap(command_line, os.environ, defaults)

    tika = TikaApp(options['TIKA_APP_JAR'])

    try:
        # Both input kinds run the same actions; only the keyword differs.
        if args.file:
            source = {"path": args.file}
        elif args.payload:
            source = {"payload": args.payload}
        else:
            return

        if args.detect:
            print(tika.detect_content_type(**source))

        if args.text:
            print(tika.extract_only_content(**source))

        if args.language:
            print(tika.detect_language(**source))

        if args.all:
            print(tika.extract_all_content(pretty_print=True, **source))

    except IOError:
        # I/O errors are ignored on purpose.
        pass
Esempio n. 13
0
class TikaAnalysis(object):
    """Add Apache Tika output to attachments with a valid content type."""

    def __init__(self,
                 jar=None,
                 memory_allocation=None,
                 valid_content_types=None):
        """Create the Tika client and remember the settings used for it.

        Args:
            jar: Tika app jar, passed to ``TikaApp`` as ``file_jar``.
            memory_allocation: passed to ``TikaApp`` unchanged.
            valid_content_types: content types to analyze; when omitted
                (``None``) every instance gets its own empty set.
        """
        # A literal ``set()`` default is created once and then shared by
        # every instance built without this argument.
        if valid_content_types is None:
            valid_content_types = set()

        # Init Tika
        self._tika_client = TikaApp(file_jar=jar,
                                    memory_allocation=memory_allocation)
        self._jar = jar
        self._memory_allocation = memory_allocation
        self._valid_content_types = valid_content_types

    @property
    def jar(self):
        """Jar path stored on this instance."""
        return self._jar

    @jar.setter
    def jar(self, value):
        # Only the stored value changes; the Tika client is not rebuilt.
        self._jar = value

    @property
    def memory_allocation(self):
        """Memory allocation setting stored on this instance."""
        return self._memory_allocation

    @memory_allocation.setter
    def memory_allocation(self, value):
        # Only the stored value changes; the Tika client is not rebuilt.
        self._memory_allocation = value

    @property
    def valid_content_types(self):
        """Set of content types for which meta data is extracted."""
        return self._valid_content_types

    @valid_content_types.setter
    def valid_content_types(self, value):
        if not isinstance(value, set):
            raise InvalidContentTypes("Content types must be a set")
        self._valid_content_types = value

    def add_meta_data(self, attachment):
        """If content_type in valid_content_types this method
        extracts meta data and update attachments input results.

        Args:
            attachment (dict): attachment result; ``Content-Type`` and
                ``payload`` are read and ``tika`` is written.

        Raises:
            InvalidAttachment: if ``attachment`` is not a dict.
        """

        if not isinstance(attachment, dict):
            raise InvalidAttachment("Attachment result is not a dict")

        # The Apache Tika output of archive contains the contents and metadata
        # of all archived files.
        if attachment['Content-Type'] in self.valid_content_types:
            attachment['tika'] = self._tika_client.extract_all_content(
                payload=attachment['payload'], convert_to_obj=True)
class TikaReader:
    """Run the Tika client operations against one fixed file."""

    def __init__(self, file_process):
        """Create the Tika client and remember the file to process."""
        # Tika client that loads the client jar file.
        self.tika_client = TikaApp(file_jar="tika-app-1.20.jar")
        self.file_process = file_process

    def detect_document_type(self):
        """Detect the MIME content type of the file."""
        target = self.file_process
        return self.tika_client.detect_content_type(target)

    def detect_language(self):
        """Detect the language used in the document."""
        target = self.file_process
        return self.tika_client.detect_language(target)

    def extract_complete_info(self, value=False):
        """Extract the complete content of the document.

        ``value`` is handed to the client as ``convert_to_obj``.
        """
        target = self.file_process
        return self.tika_client.extract_all_content(target,
                                                    convert_to_obj=value)

    def extract_content_info(self):
        """Extract only the content of the document."""
        target = self.file_process
        return self.tika_client.extract_only_content(target)
Esempio n. 15
0
class ProcessJSONTika(object):
    """Read single fields out of the first entry of the Tika output.

    Every accessor calls Tika again for the given document.
    """

    def __init__(self, path):
        """Create the Tika client from the jar located at ``path``."""
        self.tika_client = TikaApp(file_jar=path)

    def jsonprocessor(self, doc):
        """Return the first entry of the converted Tika output for ``doc``."""
        parsed = self.tika_client.extract_all_content(doc,
                                                      convert_to_obj=True)
        return parsed[0]

    def author(self, doc):
        """Return the ``Author`` field, or None when it is missing."""
        fields = self.jsonprocessor(doc)
        return fields.get('Author', None)

    def creationdate(self, doc):
        """Return the ``Creation-Date`` field, or None when it is missing."""
        fields = self.jsonprocessor(doc)
        return fields.get('Creation-Date', None)

    def lastmodified(self, doc):
        """Return the ``Last-Modified`` field, or None when it is missing."""
        fields = self.jsonprocessor(doc)
        return fields.get('Last-Modified', None)

    def all_content(self, doc):
        """Return the ``X-TIKA:content`` field; a missing key raises."""
        fields = self.jsonprocessor(doc)
        return fields['X-TIKA:content']

    def top_10_words(self, doc):
        """Return up to ten ``(word, count)`` pairs, highest count first.

        Only purely alphabetic tokens that are not English stop words are
        counted.
        """
        tokens = word_tokenize(self.all_content(doc))
        # stopwords
        stopWords = set(stopwords.words('english'))

        words_dic = {}
        for token in tokens:
            if token.isalpha() and token not in stopWords:
                words_dic[token] = words_dic.get(token, 0) + 1

        ranked = sorted(words_dic.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        return ranked[:10]
Esempio n. 16
0
 def __init__(self, tika_jar_path):
     """Create the Tika client from the jar at ``tika_jar_path``."""
     client = TikaApp(file_jar=tika_jar_path)
     self.tika_client = client
Esempio n. 17
0
from tikapp import TikaApp

# Demo script: print the content type, language, text and metadata that
# Tika reports for one PDF file.
tika_client = TikaApp(
    file_jar="/Users/yma2/Documents/_garage/python/cxm/tika/tika-app-1.20.jar")

analyzeFile = "/Users/yma2/Downloads/Azure_Developer_Guide_eBook_ja-JP.pdf"

for analyse in (tika_client.detect_content_type,
                tika_client.detect_language,
                tika_client.extract_only_content,
                tika_client.extract_only_metadata):
    print(analyse(analyzeFile))
    def process(self) -> str:
        """
        Extract the metadata of the document with Tika and store it as JSON.

        The metadata file is stored under self.file_content.work_root_dir
        with the fixed name self.FileName_MetaData; the returned result
        carries the metadata format.
        Note: if a memory leak shows up, extract the metadata in a new
        process that writes the file, and parse the metadata in this process.

        Depending on the configured mode, the metadata comes from a Tika
        server (TikaServer.from_file) or from the local Tika application
        jar. The result of super().process() is returned unchanged when Tika
        is disabled, or when the server url / application path setting is
        empty.

        :return: the result built by CResult: success (with the JSON format
            information merged in) or failure with the error details
        """
        default_result = super().process()
        out_metadata_file_fullname = CFile.join_file(
            self.file_content.work_root_dir, self.FileName_MetaData)
        in_file_fullname = self.file_info.file_name_with_full_path

        if not settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Enable, True):
            return default_result

        tika_dependence_mode = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Mode, self.Name_Server)
        if CUtils.equal_ignore_case(tika_dependence_mode, self.Name_Server):
            tika_server_url = settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Server_Url, None)
            tika_server_connect_timeout = settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Server_Timeout, 30)
            if CUtils.equal_ignore_case(tika_server_url, ''):
                return default_result

            def extract_metadata():
                # Server mode: let the Tika server parse the file.
                parsed = TikaServer.from_file(
                    in_file_fullname,
                    tika_server_url,
                    requestOptions={'timeout': tika_server_connect_timeout})
                return parsed["metadata"]
        else:
            tika_application = settings.application.xpath_one(
                self.Path_Setting_Dependence_Tika_Client_App, None)
            if CUtils.equal_ignore_case(tika_application, ''):
                return default_result

            if not CFile.file_or_path_exist(tika_application):
                return CResult.merge_result(
                    self.Failure,
                    '文档[{0}]的元数据无法提取, 详细原因为: [依赖中间件{1}文件不存在, 请修正后重试!]'.format(
                        in_file_fullname, tika_application))

            def extract_metadata():
                # Client mode: run the local Tika application jar.
                tika_client = TikaApplication(file_jar=tika_application)
                return tika_client.extract_only_metadata(in_file_fullname)

        # Both modes share the same storage step and the same result
        # handling. Every path returns from inside this try/except, so no
        # statement after it can ever run.
        try:
            meta_data_dict = extract_metadata()
            json_obj = CJson()
            json_obj.load_obj(meta_data_dict)
            json_obj.to_file(out_metadata_file_fullname)
            return CResult.merge_result_info(
                CResult.merge_result(
                    self.Success,
                    '文档[{0}]的元数据提取成功'.format(in_file_fullname)),
                self.Name_Format, self.MetaDataFormat_Json)
        except Exception as error:
            return CResult.merge_result(
                self.Failure, '文档[{0}]的元数据提取过程出现错误, 详细信息为: [{1}]'.format(
                    in_file_fullname, error.__str__()))
Esempio n. 19
0
def analyse_pdf_archive(pdf_csv, keyword_csv, tika_file_jar, outfile_name):
    """Count keyword occurrences per PDF and write the totals to a CSV file.

    Args:
        pdf_csv: CSV file listing the PDF names without the ``.pdf`` suffix.
        keyword_csv: CSV file listing the keyword stems.
        tika_file_jar: Tika app jar used to extract the text.
        outfile_name: CSV file written with the columns ``id`` and
            ``keywords``.

    A word counts as a hit when any keyword is a substring of its
    normalised form (punctuation removed, reduced to lower-case ASCII). The
    value for a PDF is the number of occurrences of all such words.

    NOTE(review): processing stops at the first PDF without any words; the
    counter list is then presumably shorter than the list of names when the
    DataFrame is built -- confirm that this is intended.
    """
    def read_flat(csv_path):
        # All cells of the file, row by row, as one flat list.
        with open(csv_path, 'r', encoding='utf-8-sig') as f:
            rows = list(csv.reader(f))
        return list(itertools.chain(*rows))

    # PDF for each day saved as welt_mmdd; the names are listed in pdf_csv.
    pdf_names = read_flat(pdf_csv)
    pdf_list = [name + '.pdf' for name in pdf_names]

    # Keyword stems to look for.
    keywords = read_flat(keyword_csv)

    tika_client = TikaApp(file_jar=tika_file_jar)

    strip_punctuation = str.maketrans('', '', string.punctuation)
    day = 1  # 1-based position of the PDF, used in the progress output

    keyword_counter = []
    for pdf in pdf_list:
        rawText = tika_client.extract_only_content(pdf)
        print("pdf {0} extracted".format(day))

        cleaned = [word.translate(strip_punctuation)
                   for word in rawText.split()]
        list_words = Counter(cleaned).most_common()

        keyword_counts = 0
        for word, occurrences in list_words:
            # normalise umlauts and other non-ASCII characters
            normalised = unicodedata.normalize('NFKD', word)
            normalised = normalised.encode('ASCII', 'ignore').lower().decode()
            if any(keyword in normalised for keyword in keywords):
                keyword_counts += occurrences

        keyword_counter.append(keyword_counts)

        print("day {0} complete".format(day))

        if not list_words:
            break
        day += 1

    df = pd.DataFrame({"id": pdf_names, "keywords": keyword_counter})
    df.to_csv(outfile_name, index=False)
def tika_extract_only_content(memory=None):
    """Return the content Tika extracts from ``test_zip``.

    ``memory`` is handed to ``TikaApp`` as ``memory_allocation``.
    """
    client = TikaApp(file_jar=TIKA_APP_JAR, memory_allocation=memory)
    return client.extract_only_content(path=test_zip)
 def __init__(self, file_process):
     """Create the Tika client and remember the file to process."""
     # Tika client that loads the client jar file.
     client = TikaApp(file_jar="tika-app-1.20.jar")
     self.tika_client = client
     self.file_process = file_process
def tika_content_type():
    """Return the content type Tika detects for ``test_zip``."""
    client = TikaApp(file_jar=TIKA_APP_JAR)
    return client.detect_content_type(path=test_zip)
Esempio n. 23
0
class TikaProcessing(AbstractProcessing):
    """ This class processes the output mail attachments to add
    Apache Tika analysis.

    Args:
        jar (string): path of Apache Tika App jar
        valid_content_types (list or set): list of contents types to analyze
        memory_allocation (string): memory to give to Apache Tika App
    """
    def __init__(self, **kwargs):
        super(TikaProcessing, self).__init__(**kwargs)

        # Init Tika
        self._tika_client = TikaApp(file_jar=self.jar,
                                    memory_allocation=self.memory_allocation)

    def __getattr__(self, name):
        """Resolve attributes that are not set from the keyword arguments.

        Only called when normal attribute lookup fails. ``memory_allocation``
        defaults to None; any other unknown name raises AttributeError.
        """
        # Looking up ``self._kwargs`` below while ``_kwargs`` itself is not
        # set yet would re-enter this method without end.
        if name == "_kwargs":
            msg = "'{0}' object has no attribute '{1}'"
            raise AttributeError(msg.format(type(self).__name__, name))

        try:
            return self._kwargs[name]
        except KeyError:
            # Default values. An exact comparison is needed here:
            # ``name in ("memory_allocation")`` tests against a plain string
            # and so matches every substring of it, such as "memory".
            if name == "memory_allocation":
                return None

            msg = "'{0}' object has no attribute '{1}'"
            raise AttributeError(msg.format(type(self).__name__, name))

    def __setattr__(self, name, value):
        """Set an attribute; ``valid_content_types`` must be a set or list.

        Raises:
            InvalidContentTypes: if ``valid_content_types`` is given any
                other type.
        """
        is_content_types = name == "valid_content_types"

        # Validate before storing, so a rejected value is never kept.
        if is_content_types and not isinstance(value, (set, list)):
            raise InvalidContentTypes("Content types must be set or list")

        super(TikaProcessing, self).__setattr__(name, value)

        if is_content_types:
            self._kwargs[name] = value

    def _check_arguments(self):
        """This method checks if all mandatory arguments are given. """

        for mandatory in ('jar', 'valid_content_types'):
            if mandatory not in self._kwargs:
                msg = "Argument '{0}' not in object '{1}'"
                raise MissingArgument(
                    msg.format(mandatory, type(self).__name__))

    def process(self, attachment):
        """This method updates the attachment result
        with the Tika output.

        Args:
            attachment (dict): dict with a raw attachment mail

        Returns:
            This method updates the attachment dict given
        """

        super(TikaProcessing, self).process(attachment)

        if attachment['Content-Type'] in self.valid_content_types:
            attachment['tika'] = self._tika_client.extract_all_content(
                payload=attachment['payload'], convert_to_obj=True)
Esempio n. 24
0
 def convert_Tika(self, fname):
     """Return the content Tika extracts from ``fname``.

     The jar ``tika-app-1.20.jar`` is looked up in the current working
     directory.
     """
     jar_location = os.getcwd() + '/tika-app-1.20.jar'
     client = TikaApp(file_jar=jar_location)
     return client.extract_only_content(fname)
Esempio n. 25
0
    def __init__(self, **kwargs):
        """Initialise the parent class, then create the Tika client.

        The client is built from the ``jar`` and ``memory_allocation``
        attributes of this instance.
        """
        super(TikaProcessing, self).__init__(**kwargs)

        # Init Tika
        client = TikaApp(file_jar=self.jar,
                         memory_allocation=self.memory_allocation)
        self._tika_client = client
Esempio n. 26
0
# -*- coding: utf-8 -*- 
# @Time : 2020/12/11 10:25 
# @Author : 王西亚 
# @File : c_doc.py

from tikapp import TikaApp

# Manual check: print the type and the value of the metadata that Tika
# returns for one sample PDF.
TIKA_JAR = "/usr/local/Cellar/tika/1.24.1_1/libexec/tika-app-1.24.1.jar"
SAMPLE_PDF = "/Users/wangxiya/Downloads/000101020062805119-00.pdf"

tika_client = TikaApp(file_jar=TIKA_JAR)
metadata = tika_client.extract_only_metadata(SAMPLE_PDF)
for shown in (type(metadata), metadata):
    print(shown)

# from tika import parser

# parsed = parser.from_file('/path/to/file')
# print(parsed["metadata"])
# print(parsed["content"])

# parsed = parser.from_file('/Users/wangxiya/Downloads/000101020062805119-00.pdf', 'http://localhost:9998/tika')
# metadata = parsed["metadata"]
# print(type(metadata))
# print(metadata)
 def extract_text(filename):
     """Return the content Tika extracts from ``filename``.

     The jar is ``lib/tika-app-1.28.jar``, made absolute against the current
     working directory.
     """
     relative_jar = os.path.join("lib", "tika-app-1.28.jar")
     client = TikaApp(file_jar=os.path.abspath(relative_jar))
     return client.extract_only_content(filename)
def tika_detect_language():
    """Return the language Tika detects for ``test_zip``."""
    client = TikaApp(file_jar=TIKA_APP_JAR)
    return client.detect_language(path=test_zip)
Esempio n. 29
0
class FileBeats:
    """Walk directories and index file metadata into Elasticsearch.

    For every file found, a "tags" dict (path, extension, name, size and,
    optionally, the text content extracted by Tika) is built and indexed
    using the file's absolute path as document id.
    """

    # Only files with one of these extensions get their text content
    # extracted and indexed.
    export_content_exts = ('.md', '.html', '.htm', '.txt', '.ppt', '.pptx',
                           '.key', '.pdf', ".pages", ".doc", ".docx", '.py',
                           '.java')

    # Number of pending extraction greenlets that triggers a flush to
    # Elasticsearch in asynchronous mode.
    _BATCH_SIZE = 5

    def __init__(self,
                 index,
                 es_host="localhost:9200",
                 file_jar='/Users/laofeng/es_home/tika-app-1.23.jar',
                 index_content=False,
                 create_index=True,
                 force_renew_index=False,
                 schema_file=None):
        """Create the Tika and Elasticsearch clients and prepare the index.

        Args:
            index: name of the Elasticsearch index to write to.
            es_host: host specification passed to ``Elasticsearch``.
            file_jar: path of the Tika application jar.
            index_content: when true, text content is extracted for files
                whose extension is in ``export_content_exts``.
            create_index: when true and the index does not exist yet, it is
                created from ``schema_file`` (if one is given).
            force_renew_index: when true and the index already exists, it
                is deleted and, if ``schema_file`` is given, recreated
                from that schema.
            schema_file: path of a UTF-8 JSON file holding the index
                definition; ``None`` means no explicit schema.
        """
        self.tika_app = TikaApp(file_jar)
        self.es = Elasticsearch(es_host)
        self.index = index
        self.index_content = index_content

        # Check whether the index already exists.
        index_exist = self.es.indices.exists(index)

        if not index_exist and create_index and schema_file:
            self._create_index_from_schema(schema_file)

        # The index exists and a renewal is forced: drop it, then recreate
        # it. Without a schema file there is nothing to recreate it from,
        # so the index is only deleted (previously this crashed in
        # ``open(None)`` right after the deletion).
        if index_exist and force_renew_index:
            self.es.indices.delete(index)
            if schema_file:
                self._create_index_from_schema(schema_file)

    def _create_index_from_schema(self, schema_file):
        """Create ``self.index`` using the JSON definition in ``schema_file``."""
        with open(schema_file, 'r', encoding='utf-8') as f:
            schema = json.load(f)
        self.es.indices.create(self.index, schema)

    @staticmethod
    def second2date(second, style="%Y-%m-%d %H:%M:%S"):
        """Format ``second`` (seconds since the epoch) as local time.

        Args:
            second: timestamp in seconds.
            style: ``time.strftime`` format string.

        Returns:
            The formatted local date/time string.
        """
        return time.strftime(style, time.localtime(second))

    def export_file_tags(self, abs_path):
        """Build the tags dict describing the file at ``abs_path``.

        Args:
            abs_path: path of the file to describe.

        Returns:
            A dict with the keys ``path``, ``ext`` (extension without the
            leading dot), ``name`` (base name) and ``size`` (bytes), plus
            ``content`` when content indexing is enabled, the extension is
            eligible and Tika returned a non-empty result.
        """
        ext = os.path.splitext(abs_path)[1]
        tags = {
            "path": abs_path,
            "ext": ext.lstrip('.'),  # extension without its leading dot
            "name": os.path.basename(abs_path),
            "size": os.path.getsize(abs_path),
        }

        # TODO: skip content extraction for files that are too large or too
        # small; no size filter is implemented yet.
        if self.index_content and ext.lower() in FileBeats.export_content_exts:
            try:
                # NOTE(review): the literal payload looks like a leftover
                # placeholder; tikapp appears to use ``path`` when both are
                # given -- confirm before removing it.
                r = self.tika_app.extract_only_content(
                    path=abs_path, payload="base64_payload")
                if r:
                    tags['content'] = r
            except Exception:
                # Content extraction is best effort: report the failure and
                # still return the basic tags.
                traceback.print_exc()
        return tags

    def index_doc(self, tags, _type='_doc'):
        """Index one tags dict, using its ``path`` value as document id.

        A ``timestamp`` entry holding the current time is added to ``tags``
        before the document is sent.

        Args:
            tags: dict as produced by ``export_file_tags``.
            _type: Elasticsearch document type.
        """
        tags['timestamp'] = datetime.now()
        self.es.index(index=self.index,
                      doc_type=_type,
                      body=tags,
                      id=tags['path'])

    def index_docs(self, docs):
        """Index every tags dict in ``docs``, one request per document."""
        for doc in docs:
            self.index_doc(doc)

    def process_file(self, f):
        """Export the tags of file ``f`` and index them immediately."""
        tags = self.export_file_tags(f)
        self.index_doc(tags)

    def beats_more(self, folders, asynchronous=True):
        """Index every folder in ``folders``, one after the other.

        Args:
            folders: iterable of directory paths.
            asynchronous: forwarded to ``start_beats`` for each folder
                (it used to be silently ignored).
        """
        print("开始索引文件", FileBeats.second2date(time.time()))
        for folder in folders:
            self.start_beats(folder, asynchronous=asynchronous)

        print("索引文件结束", FileBeats.second2date(time.time()))

    def _flush_greenlets(self, greenlets):
        """Wait for ``greenlets``, index their results and empty the list.

        A greenlet whose extraction failed leaves ``value`` as ``None``;
        such entries are skipped. The list is cleared even when indexing
        raises, so a failing batch is never retried on every later file.
        """
        try:
            gevent.joinall(greenlets)
            self.index_docs(
                [g.value for g in greenlets if g.value is not None])
        finally:
            greenlets.clear()

    def start_beats(self,
                    source_dir='/Volumes/portable/sync/',
                    asynchronous=True):
        """Walk ``source_dir`` and index every eligible file below it.

        Directories whose path contains ``@``, ``.svn`` or "迅雷", or ends
        with ``.app``, are skipped, as are files whose name starts with a
        dot.

        Args:
            source_dir: root directory to walk.
            asynchronous: when true, tags are exported in gevent greenlets
                and indexed in batches of ``_BATCH_SIZE``; otherwise each
                file is exported and indexed inline.
        """
        print("开始索引文件", source_dir, FileBeats.second2date(time.time()))
        greenlets = list()

        for folder, dirs, files in os.walk(source_dir, topdown=False):
            # Skip folders that should not be indexed.
            if '@' in folder or '.svn' in folder or folder.endswith(
                    '.app') or "迅雷" in folder:
                print('忽略目录', folder)
                continue
            for f in files:
                if f.startswith("."):
                    continue
                abs_path = os.path.join(folder, f)
                try:
                    if asynchronous:
                        greenlets.append(
                            gevent.spawn(self.export_file_tags, abs_path))
                    else:
                        self.index_doc(self.export_file_tags(abs_path))
                    # Indexing is done sequentially on purpose: concurrent
                    # Elasticsearch requests produced read timeouts or
                    # socket errors.
                    if len(greenlets) >= self._BATCH_SIZE:
                        self._flush_greenlets(greenlets)
                except Exception:
                    # One bad file must not abort the whole walk.
                    traceback.print_exc()

        # Flush the last, possibly incomplete, batch.
        if asynchronous:
            self._flush_greenlets(greenlets)

        print("索引文件结束", source_dir, FileBeats.second2date(time.time()))
from tikapp import TikaApp
from image_mod import convert_image_to_string
import os

# The Tika jar ships in the ``src`` folder located next to this module.
_module_dir = os.path.dirname(os.path.realpath(__file__))
_tika_jar = os.path.join(_module_dir, 'src', 'tika-app-1.22.jar')
tika_client = TikaApp(_tika_jar)


def receive_text_from_file(path: str, ext=None):
    """Return the text of the file at ``path``.

    Tika is asked for the content first. Only when it yields an empty
    string and ``ext`` is ``'pdf'`` is the file handed to
    ``convert_image_to_string`` instead.

    Args:
        path: location of the file to read.
        ext: file extension without a dot; only ``'pdf'`` triggers the
            fallback.

    Returns:
        The Tika result, or the result of ``convert_image_to_string`` for
        a PDF from which Tika extracted nothing.
    """
    extracted = tika_client.extract_only_content(path)
    if extracted != "" or ext != 'pdf':
        return extracted
    return convert_image_to_string(path, ext=ext)
Esempio n. 31
0
 def handle_backup_file(self, context, savepkt):
     """Index the Tika metadata and content of a backed-up regular file.

     For a regular file (``savepkt.type == bFileType['FT_REG']``) the file
     is parsed with Tika, the first JSON record of the result is enriched
     with job information and sent to Elasticsearch. Any failure along the
     way is reported with an ``M_ERROR`` job message and indexing of this
     file is abandoned.

     Args:
         context: plugin context handed to ``DebugMessage``/``JobMessage``.
         savepkt: save packet; ``fname`` and ``type`` are read from it.

     Returns:
         ``bRCs['bRC_OK']`` in every case, including when indexing failed.
     """
     DebugMessage(context, 100,
                  "handle_backup_file called with " + str(savepkt) + "\n")
     DebugMessage(
         context, 100,
         "fname: " + savepkt.fname + " Type: " + str(savepkt.type) + "\n")
     # Only regular files are indexed.
     if savepkt.type != bFileType['FT_REG']:
         return bRCs['bRC_OK']

     DebugMessage(context, 100, "regulaer file, do something now...\n")
     # Configure your Elasticsearch server here:
     es = Elasticsearch([{'host': '192.168.17.2', 'port': 9200}])
     # Configure your TikaApp jar file here:
     try:
         tika_client = TikaApp(
             file_jar="/usr/local/bin/tika-app-1.20.jar")
     except Exception as ex:
         JobMessage(
             context, bJobMessageType['M_ERROR'],
             'Error indexing %s. Tika error: %s' %
             (savepkt.fname, str(ex)))
         return bRCs['bRC_OK']
     # tika_client offers several parser options; extract_only_metadata
     # would return the metadata alone. extract_all_content also includes
     # the file contents as text:
     try:
         result_payload = tika_client.extract_all_content(savepkt.fname)
     except Exception as ex:
         JobMessage(
             context, bJobMessageType['M_ERROR'],
             'Error extracting contents from %s. Tika error: %s' %
             (savepkt.fname, str(ex)))
         return bRCs['bRC_OK']
     # result_payload is a JSON list. Nested structures such as tar files
     # or emails with attachments or inline documents are returned as
     # distinct entries; entry [0] describes the main file.
     # TODO: handle nested structures; for now only the first/main entry
     # is used.
     try:
         data = json.loads(result_payload)[0]
     except Exception as ex:
         JobMessage(
             context, bJobMessageType['M_ERROR'],
             'Error reading json fields delivered by Tika examining file %s. Json error: %s'
             % (savepkt.fname, str(ex)))
         return bRCs['bRC_OK']
     # Tika sometimes adds "Unknown tag (id)" keys with an increasing id,
     # which could exceed the keyword limit of Elasticsearch indices, so
     # those keys are removed. Iterate over a snapshot of the keys:
     # deleting from a dict while iterating its live key view raises
     # RuntimeError on Python 3.
     for data_keyword in list(data.keys()):
         if data_keyword.startswith("Unknown tag ("):
             del data[data_keyword]
     # Tika adds some empty lines at the beginning of the content; strip
     # surrounding whitespace here.
     if 'X-TIKA:content' in data:
         data['X-TIKA:content'] = data['X-TIKA:content'].strip()
     data['bareos_jobId'] = self.jobId
     data['bareos_fdname'] = self.fdname
     data['bareos_joblevel'] = unichr(self.level)
     data['bareos_directory'] = os.path.dirname(savepkt.fname)
     try:
         es.index(index="bareos-test",
                  doc_type='_doc',
                  body=data)
     except Exception as ex:
         JobMessage(
             context, bJobMessageType['M_ERROR'],
             'Error indexing %s. Elastic error: %s' %
             (savepkt.fname, str(ex)))
     return bRCs['bRC_OK']