Ejemplo n.º 1
0
def index_documents(options, args):
    """Parse a document and print its metadata followed by its text.

    For each item of ``args`` the MIME type is obtained from
    ``TikaParser.get_mime_type`` and the item is parsed with
    ``TikaParser.parse``. When the parsed text reports no keys, a fallback
    parser is selected from the ``content_type`` metadata entry
    (``TextParser`` for ``text/plain``, ``OpenDocumentParser`` for
    OpenDocument types) and its metadata is merged into the existing
    metadata. Metadata from ``FsMetaParser`` is merged afterwards, and
    ``content_type`` is finally overwritten with the detected MIME type.

    Args:
        options: Not read by the active code; only the disabled indexing
            call below refers to it.
        args: Iterable of documents to parse (presumably file paths --
            confirm against the caller).

    Note:
        ``exit()`` is called at the end of the first loop iteration, so
        only the first item of ``args`` is ever processed.
    """
    for document in args:
        detected_type = TikaParser.get_mime_type(document)

        text, meta = TikaParser.parse(document)

        # No keys in the parsed text: retry with a format-specific parser
        # chosen from the content type reported in the metadata.
        if not text.keys():
            reported_type = meta['content_type']
            if 'text/plain' in reported_type:
                text, extra_meta = TextParser.parse(document)
                meta.update(extra_meta)
            elif 'vnd.oasis.opendocument' in reported_type:
                text, extra_meta = OpenDocumentParser.parse(document)
                meta.update(extra_meta)

        # Only the metadata half of the result is used here.
        _unused, fs_meta = FsMetaParser.parse(document)
        meta.update(fs_meta)

        # The detected MIME type replaces whatever the parsers reported.
        meta['content_type'] = detected_type

        for name in meta:
            value = meta.get(name)
            print("{}: {}".format(name, value))

        # Disabled indexing step, kept for reference:
        # if meta.get('content_type', '') == 'application/pdf':
        #     es_index(text, meta, doctype='pdf', options=options)

        print(text)
        # NOTE(review): terminates inside the loop, after the first item.
        exit()
Ejemplo n.º 2
0
 def get_terms(self, query_text):
     """Return the term-store ids for the terms parsed from ``query_text``.

     ``TextParser.parse`` is applied to ``query_text`` and each resulting
     term is passed to ``self.term_store.get_id_for_term``; the ids are
     returned as a list in the order the terms were produced.
     """
     return [
         self.term_store.get_id_for_term(term)
         for term in TextParser.parse(query_text)
     ]