Esempio n. 1
0
 def ingest(self, meta, local_path):
     """Convert an Outlook PST file and ingest every file the converter emits.

     The external ``readpst`` tool (its path can be overridden through the
     ``READPST_BIN`` environment variable) is run against ``local_path``
     with a fresh temporary directory passed as its ``-o`` argument. Every
     file found below that directory is handed to ``ingest_file`` together
     with a child of ``meta``; the child's keywords are the components of
     the file's directory relative to the temporary directory. The
     temporary directory is removed afterwards, even on error.

     Note: the exit status of the converter is not inspected.
     """
     work_dir = make_tempdir()
     try:
         readpst = os.environ.get('READPST_BIN', 'readpst')
         command = [readpst, '-D', '-e', '-8', '-b', '-o', work_dir,
                    local_path]
         log.debug('Converting Outlook PST file: %r', ' '.join(command))
         subprocess.call(command)
         for (dirpath, dirnames, filenames) in os.walk(work_dir):
             dir_text = string_value(dirpath)
             reldir = os.path.relpath(dir_text, string_value(work_dir))
             for raw_name in filenames:
                 name = string_value(raw_name)
                 child = meta.make_child()
                 # Each path component of the relative folder is a keyword.
                 for keyword in reldir.split(os.path.sep):
                     child.add_keyword(keyword)
                 foreign_id = os.path.join(string_value(meta.foreign_id),
                                           string_value(reldir), name)
                 child.foreign_id = string_value(foreign_id)
                 extracted = os.path.join(dir_text, name)
                 ingest_file(self.collection_id, child, extracted,
                             move=True)
     finally:
         remove_tempdir(work_dir)
Esempio n. 2
0
def ingest_upload(collection_id):
    """Accept uploaded files for a collection and queue them for ingest.

    The collection must exist (404 otherwise) and the requesting role must
    have write access to it. The ``meta`` form field is parsed as JSON and
    stamped with a crawler id and crawler run; any failure while doing so
    is reported as ``BadRequest``. Each uploaded file is validated against
    the ``metadata.json#`` schema, saved into the upload folder under a
    sanitised name, and passed to ``ingest_file`` on the user queue.

    Returns a JSON response listing the metadata of all accepted files.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_write(collection.id))
    log_event(request)
    try:
        meta = json.loads(request.form.get('meta', '{}'))
        meta['crawler_id'] = 'user_upload:%s' % request.authz.role.id
        meta['crawler_run'] = make_textid()
    except Exception as ex:
        raise BadRequest(unicode(ex))

    accepted = []
    for upload in request.files.values():
        # Start from the shared request metadata, then add per-file fields.
        data = meta.copy()
        data['mime_type'] = upload.mimetype
        data['file_name'] = upload.filename
        data['source_path'] = upload.filename
        validate(data, 'metadata.json#')
        file_meta = Metadata.from_data(data)
        safe_name = secure_filename(upload.filename)
        target = os.path.join(upload_folder, safe_name)
        upload.save(target)
        ingest_file(collection_id, file_meta, target, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        accepted.append(file_meta)
    return jsonify({'status': 'ok', 'metadata': accepted})
Esempio n. 3
0
def ingest_upload(collection_id):
    """Accept uploaded files for a collection and queue them for ingest.

    The collection must exist (404 otherwise) and the caller must have
    write access to it. The ``meta`` form field must contain a JSON
    object; it is used as the base metadata for every uploaded file.

    Raises:
        BadRequest: if ``meta`` is not valid JSON or is not a JSON object.

    Returns a JSON response listing the metadata of all accepted files.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)
    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(unicode(ex))
    # Valid JSON such as `[]`, `null` or `"x"` would otherwise fail further
    # down with an unhandled error instead of a client error.
    if not isinstance(meta, dict):
        raise BadRequest(u'The "meta" field must be a JSON object.')

    metas = []
    for storage in request.files.values():
        file_meta = meta.copy()
        file_meta['mime_type'] = storage.mimetype
        file_meta['file_name'] = storage.filename
        validate(file_meta, 'metadata.json#')
        file_meta = Metadata.from_data(file_meta)
        file_meta.crawler_id = 'user_upload:%s' % request.auth_role.id
        file_meta.crawler_run = make_textid()
        sec_fn = os.path.join(get_upload_folder(),
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        ingest_file(collection.id, file_meta, sec_fn, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        metas.append(file_meta)
    return jsonify({'status': 'ok', 'metadata': metas})
Esempio n. 4
0
    def crawl_query(self, engine, collection, meta_base, name, query):
        """Run a configured SQL query and ingest its result set as a CSV file.

        ``query`` is the query configuration mapping; its optional ``meta``
        entry is merged over ``meta_base`` to build the document metadata.
        The rows are streamed in batches into a temporary CSV file (header
        row first, all fields quoted), which is then passed to
        ``ingest_file``. The temporary file is deleted afterwards if it
        still exists.
        """
        meta_ = meta_base.copy()
        meta_.update(query.get('meta', {}))
        meta = self.make_meta(meta_)
        meta.extension = 'csv'
        meta.mime_type = 'text/csv'
        meta.data.update(meta_)
        meta.foreign_id = '%s:%s' % (collection.foreign_id, name)

        sql_query = SQLQuery(engine, query)

        fd, file_path = mkstemp(suffix='.csv')
        try:
            # The context manager closes the file even when the query or a
            # write fails, so the handle is never leaked.
            with os.fdopen(fd, 'w') as fh:
                headers = [sql_query.alias(c) for c in sql_query.columns]
                writer = unicodecsv.writer(fh, quoting=unicodecsv.QUOTE_ALL)
                writer.writerow(headers)
                log.info('Query: %s', sql_query.query)
                rp = engine.execute(sql_query.query)
                while True:
                    # Fetch in batches to bound memory use on large results.
                    rows = rp.fetchmany(10000)
                    if not rows:
                        break
                    for row in rows:
                        writer.writerow([row[h] for h in headers])
            ingest_file(collection.id, meta, file_path, move=True)
        finally:
            # With move=True the file may already be gone after ingest.
            if os.path.isfile(file_path):
                os.unlink(file_path)
Esempio n. 5
0
 def emit_file(self, meta, file_path):
     """Ingest ``file_path`` as a child document of ``meta``.

     The child starts as a clone of ``meta`` with the file-specific fields
     cleared, and gets a second clone of ``meta`` as its parent.
     """
     child = meta.clone()
     for field in ('extension', 'file_name', 'content_hash', 'mime_type'):
         child.clear(field)
     # foreign_id is left in place (the call clearing it is disabled).
     child.parent = meta.clone()
     ingest_file(self.collection_id, child, file_path)
Esempio n. 6
0
 def ingest_message(self, filepath, meta):
     """Ingest the message file at ``filepath`` as a child of ``meta``.

     The child is a clone of ``meta`` with title, extension, file name and
     MIME type cleared; its source path is set to ``filepath`` and the
     file is handed to ``ingest_file`` with ``move=True``.
     """
     child = meta.clone()
     for field in ('title', 'extension', 'file_name', 'mime_type'):
         child.clear(field)
     child.parent = meta.clone()
     child.source_path = filepath
     ingest_file(self.source_id, child, filepath, move=True)
Esempio n. 7
0
 def emit_file(self, meta, file_path):
     """Hand ``file_path`` to the ingest pipeline as a child of ``meta``.

     Extension, file name, content hash and MIME type are reset on the
     child so they are not inherited from the parent document.
     """
     reset_fields = ['extension', 'file_name', 'content_hash', 'mime_type']
     child = meta.clone()
     for field_name in reset_fields:
         child.clear(field_name)
     # The foreign_id is kept as cloned (the call clearing it is disabled).
     child.parent = meta.clone()
     ingest_file(self.collection_id, child, file_path)
Esempio n. 8
0
 def crawl_file(self, source, file_path, base_meta):
     """Ingest one file from disk in place, logging any failure.

     The normalised path is used as foreign id and source path, and its
     base name as the file name. Errors are logged rather than raised, so
     a caller processing several files is not interrupted by one failure.
     """
     try:
         meta = self.make_meta(base_meta)
         path = string_value(file_path)
         meta.foreign_id = path
         meta.source_path = path
         meta.file_name = os.path.basename(path)
         ingest_file(source.id, meta, path, move=False)
     except Exception as ex:
         log.exception(ex)
Esempio n. 9
0
 def crawl_file(self, source_id, file_path, base_meta):
     """Ingest one regular file from disk in place, logging any failure.

     Paths that are not regular files are reported at info level and
     skipped. Errors are logged rather than propagated.
     """
     try:
         if os.path.isfile(file_path):
             meta = self.make_meta(base_meta)
             path = string_value(file_path)
             meta.foreign_id = path
             meta.source_path = path
             meta.file_name = os.path.basename(path)
             ingest_file(source_id, meta, path, move=False)
         else:
             log.info('Invalid file path: %r', file_path)
     except Exception as ex:
         log.exception(ex)
Esempio n. 10
0
 def crawl_file(self, collection_id, file_path, base_meta):
     """Ingest a single on-disk file into ``collection_id`` without moving it.

     Non-file paths are logged and ignored; any exception raised while
     preparing or ingesting the file is logged and swallowed.
     """
     try:
         exists = os.path.isfile(file_path)
         if not exists:
             log.info('Invalid file path: %r', file_path)
             return
         meta = self.make_meta(base_meta)
         normalized = string_value(file_path)
         # The path doubles as the stable identifier and the source path.
         meta.foreign_id = normalized
         meta.source_path = normalized
         meta.file_name = os.path.basename(normalized)
         ingest_file(collection_id, meta, normalized, move=False)
     except Exception as ex:
         log.exception(ex)
Esempio n. 11
0
    def ingest_attachment(self, part, meta):
        """Write an e-mail attachment to a temp file and ingest it.

        Attachments without a body are logged as warnings and skipped. The
        child metadata takes its MIME type and file name from the values
        detected on ``part``. The temp file is removed once ingest returns
        or raises.
        """
        if part.body is None:
            log.warning("Empty attachment [%r]: %s", meta, part)
            return

        attachment_meta = meta.make_child()
        attachment_meta.mime_type = six.text_type(part.detected_content_type)
        attachment_meta.file_name = string_value(part.detected_file_name)
        temp_path = self.write_temp(part.body, attachment_meta.extension)

        try:
            ingest_file(self.collection_id, attachment_meta, temp_path,
                        move=True)
        finally:
            remove_tempfile(temp_path)
Esempio n. 12
0
 def ingest_attachment(self, attachment, meta):
     """Write an attachment's payload to a temp file and ingest it.

     Attachments without data are logged as warnings and skipped. Any
     exception is logged rather than propagated. The temp file is removed
     even when writing or ingesting fails.
     """
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta, attachment.longFilename)
             return
         out_path = make_tempfile()
         try:
             # Binary mode: the payload is written as-is, with no newline
             # translation (assumes attachment.data is a byte string).
             with open(out_path, "wb") as fh:
                 fh.write(attachment.data)
             child = meta.make_child()
             child.file_name = attachment.longFilename
             ingest_file(self.collection_id, child, out_path, move=True)
         finally:
             # Clean up on every path so a failed ingest leaves no temp file.
             remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Esempio n. 13
0
 def ingest_attachment(self, attachment, meta):
     """Write an attachment's payload to a temp file and ingest it.

     Attachments without data are logged as warnings and skipped. Any
     exception is logged rather than propagated. The temp file is removed
     even when writing or ingesting fails.
     """
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta,
                         attachment.longFilename)
             return
         out_path = make_tempfile()
         try:
             # Binary mode: the payload is written as-is, with no newline
             # translation (assumes attachment.data is a byte string).
             with open(out_path, 'wb') as fh:
                 fh.write(attachment.data)
             child = meta.make_child()
             child.file_name = string_value(attachment.longFilename)
             ingest_file(self.collection_id, child, out_path, move=True)
         finally:
             # Clean up on every path so a failed ingest leaves no temp file.
             remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Esempio n. 14
0
    def emit_member(self, meta, name, fh, temp_dir):
        """Extract one archive member into ``temp_dir`` and ingest it.

        Only the base name of ``name`` is used for the on-disk file, while
        the full member name is recorded as the child's source path.
        Failures are logged rather than raised.
        """
        try:
            base_name = os.path.basename(os.path.normpath(name))
            target = os.path.join(temp_dir, base_name)
            child = meta.clone()
            for field in ('title', 'extension', 'file_name', 'mime_type'):
                child.clear(field)
            child.parent = meta.clone()
            child.file_name = base_name
            child.source_path = name

            with open(target, 'wb') as out:
                shutil.copyfileobj(fh, out)
            ingest_file(self.source_id, child, target)
        except Exception as ex:
            log.exception(ex)
Esempio n. 15
0
    def emit_member(self, meta, name, fh, temp_dir):
        """Extract one archive member into ``temp_dir`` and ingest it.

        The child metadata is a clone of ``meta`` with its file-specific
        fields reset. Its foreign id is the parent's foreign id and the
        member name joined by a colon. The extracted file is handed to
        ``ingest_file`` with ``move=True``.
        """
        base_name = os.path.basename(os.path.normpath(name))
        target = os.path.join(temp_dir, base_name)

        child = meta.clone()
        reset_fields = ('title', 'extension', 'file_name', 'content_hash',
                        'mime_type', 'foreign_id')
        for field in reset_fields:
            child.clear(field)
        child.parent = meta.clone()
        child.file_name = base_name
        child.source_path = name
        child.foreign_id = '%s:%s' % (meta.foreign_id, name)

        with open(target, 'wb') as out:
            shutil.copyfileobj(fh, out)
        ingest_file(self.source_id, child, target, move=True)
Esempio n. 16
0
    def emit_member(self, meta, name, fh, temp_dir):
        """Copy a package member to ``temp_dir`` and pass it to the ingestor.

        Any exception raised along the way is logged and suppressed.
        """
        try:
            # Keep only the final path component for the on-disk file name.
            normalized = os.path.normpath(name)
            member_file = os.path.basename(normalized)
            member_path = os.path.join(temp_dir, member_file)

            child = meta.clone()
            child.clear('title')
            child.clear('extension')
            child.clear('file_name')
            child.clear('mime_type')
            child.parent = meta.clone()
            child.file_name = member_file
            child.source_path = name

            with open(member_path, 'wb') as sink:
                shutil.copyfileobj(fh, sink)
            ingest_file(self.source_id, child, member_path)
        except Exception as ex:
            log.exception(ex)
Esempio n. 17
0
    def ingest_attachment(self, part, meta):
        """Write an e-mail attachment to a temp file and ingest it.

        Parts without a body and the Outlook ``rtf-body.rtf`` representation
        are skipped. The skip decision is made before the temp file is
        written, so skipped parts leave nothing on disk, and the temp file
        is removed even when ingest raises.
        """
        _, ext = os.path.splitext(part.detected_file_name)
        if len(ext):
            ext = ext.strip().lower()
        body = part.body
        if body is None:
            return

        child = meta.make_child()
        child.file_name = part.detected_file_name
        child.mime_type = part.detected_content_type

        # Weird outlook RTF representations -- do we want them?
        if child.file_name == 'rtf-body.rtf':
            return

        out_path = self.write_temp(body, ext)
        try:
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            remove_tempfile(out_path)
Esempio n. 18
0
    def emit_member(self, meta, name, fh, temp_dir):
        """Copy a package member to ``temp_dir`` and ingest it as a child.

        The on-disk file is named after the member's base name. The child's
        foreign id combines the parent's foreign id with the full member
        name, and the file is handed to ``ingest_file`` with ``move=True``.
        """
        member_file = os.path.basename(os.path.normpath(name))
        member_path = os.path.join(temp_dir, member_file)

        child = meta.clone()
        for field in ('title', 'extension', 'file_name',
                      'content_hash', 'mime_type', 'foreign_id'):
            child.clear(field)
        child.parent = meta.clone()
        child.file_name = member_file
        child.source_path = name
        child.foreign_id = '%s:%s' % (meta.foreign_id, name)

        with open(member_path, 'wb') as sink:
            shutil.copyfileobj(fh, sink)
        ingest_file(self.collection_id, child, member_path, move=True)
Esempio n. 19
0
 def ingest(self, meta, local_path):
     """Convert an Outlook PST file and ingest every file the converter emits.

     The external ``readpst`` tool (its path can be overridden through the
     ``READPST_BIN`` environment variable) is run against ``local_path``
     with a temporary directory passed as its ``-o`` argument. Each file
     found below that directory is ingested with its own child of
     ``meta``, carrying the relative folder components as keywords and a
     foreign id derived from the parent's. The temporary directory is
     removed afterwards, even on error.
     """
     work_dir = make_tempdir()
     try:
         bin_path = os.environ.get('READPST_BIN', 'readpst')
         args = [bin_path, '-D', '-e', '-o', work_dir, local_path]
         log.debug('Converting Outlook PST file: %r', ' '.join(args))
         subprocess.call(args)
         for (dirpath, dirnames, filenames) in os.walk(work_dir):
             reldir = os.path.relpath(dirpath, work_dir)
             for filename in filenames:
                 child = meta.make_child()
                 for kw in reldir.split(os.path.sep):
                     child.add_keyword(kw)
                 child.foreign_id = os.path.join(meta.foreign_id, reldir,
                                                 filename)
                 # Pass the per-file child, not the parent: otherwise the
                 # keywords and foreign id set above are thrown away.
                 ingest_file(self.source_id, child,
                             os.path.join(dirpath, filename), move=True)
     finally:
         remove_tempdir(work_dir)
Esempio n. 20
0
 def emit_bundle(self, meta, directory, files):
     """Pack ``files`` from ``directory`` into an uncompressed zip and ingest it.

     The bundle is a child of ``meta`` named ``<file_name>.<extension>``
     using the class's bundle extension and MIME type. The archive is
     built in a temporary directory that is removed afterwards.
     """
     extension = self.BUNDLE_EXTENSION
     bundle = meta.make_child()
     if meta.foreign_id:
         bundle.source_path = os.path.join(meta.foreign_id, extension)
     bundle.mime_type = self.BUNDLE_MIME
     bundle.file_name = '%s.%s' % (meta.file_name, extension)
     log.info("Creating bundle: %r", bundle.file_name)

     temp_dir = make_tempdir()
     try:
         archive_path = os.path.join(temp_dir, bundle.file_name)
         with ZipFile(archive_path, 'w', ZIP_STORED) as archive:
             for member in files:
                 # Members are stored under their bare names.
                 archive.write(os.path.join(directory, member), member)
         ingest_file(self.collection_id, bundle, archive_path,
                     move=True)
     finally:
         remove_tempdir(temp_dir)
Esempio n. 21
0
    def crawl_item(self, item):
        """Ingest one imported item into the collection its metadata names.

        The item's ``source`` metadata must carry a ``foreign_id``. A
        collection is created (and cached on ``self.collections``) the
        first time a foreign id is seen; if the source is flagged
        ``public``, a grant is issued for the guest role.

        Raises:
            ValueError: if the source metadata has no ``foreign_id``.
        """
        coll_data = item.meta.get('source', {})
        # Default to None so a missing key reaches the ValueError below
        # instead of raising a bare KeyError.
        coll_fk = coll_data.pop('foreign_id', None)
        if coll_fk is None:
            raise ValueError("No foreign_id for collection given: %r" % item)
        if coll_fk not in self.collections:
            label = coll_data.get('label', coll_fk)
            self.collections[coll_fk] = Collection.create({
                'foreign_id': coll_fk,
                'label': label
            })
            if coll_data.get('public'):
                # NOTE(review): the two flags are presumably read=True,
                # write=False -- confirm against Permission.grant_foreign.
                Permission.grant_foreign(self.collections[coll_fk],
                                         Role.SYSTEM_GUEST,
                                         True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        ingest_file(self.collections[coll_fk].id, meta,
                    item.data_path, move=False)
Esempio n. 22
0
    def crawl_item(self, item, source):
        """Ingest one imported item into the source its metadata names.

        The foreign id comes from the item's ``source`` metadata and falls
        back to the ``source`` argument. Unknown foreign ids get a new
        ``Source`` record, cached on ``self.sources``. Raises ValueError
        when no foreign id can be determined.
        """
        source_data = item.meta.get('source', {})
        foreign_id = source_data.pop('foreign_id', source)
        if foreign_id is None:
            raise ValueError("No foreign_id for source given: %r" % item)

        known = foreign_id in self.sources
        if not known:
            self.sources[foreign_id] = Source.create({
                'foreign_id': foreign_id,
                'label': source_data.get('label', foreign_id)
            })
            is_public = source_data.get('public')
            if is_public:
                Permission.grant_foreign(self.sources[foreign_id],
                                         Role.SYSTEM_GUEST, True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        target_id = self.sources[foreign_id].id
        ingest_file(target_id, meta, item.data_path, move=False)
Esempio n. 23
0
    def ingest_attachment(self, part, meta):
        """Write an e-mail attachment to a temp file and ingest it.

        Parts without a body and the Outlook ``rtf-body.rtf`` representation
        are skipped. The child metadata is built, and the skip decision
        made, before the temp file is written, so skipped parts no longer
        leave an orphaned temp file behind.
        """
        _, ext = os.path.splitext(part.detected_file_name)
        if len(ext):
            ext = ext.strip().lower()
        body = part.body
        if body is None:
            return

        child = meta.clone()
        child.clear('title')
        child.clear('extension')
        child.clear('mime_type')
        child.clear('file_name')
        child.parent = meta.clone()
        child.file_name = unicode(part.detected_file_name)
        child.mime_type = unicode(part.detected_content_type)

        # Weird outlook RTF representations -- do we want them?
        if child.file_name == 'rtf-body.rtf':
            return

        # Only touch the disk once we know the part will be ingested.
        out_path = self.write_temp(body, ext)
        ingest_file(self.source_id, child, out_path, move=True)
Esempio n. 24
0
    def crawl_item(self, item):
        """Ingest one imported item into the collection its metadata names.

        The item's ``source`` metadata must carry a ``foreign_id``. A
        collection marked ``managed`` is created (and cached on
        ``self.collections``) the first time a foreign id is seen; if the
        source is flagged ``public``, a grant is issued for the guest role.

        Raises:
            ValueError: if the source metadata has no ``foreign_id``.
        """
        coll_data = item.meta.get('source', {})
        # Default to None so a missing key reaches the ValueError below
        # instead of raising a bare KeyError.
        coll_fk = coll_data.pop('foreign_id', None)
        if coll_fk is None:
            raise ValueError("No foreign_id for collection given: %r" % item)
        if coll_fk not in self.collections:
            label = coll_data.get('label', coll_fk)
            self.collections[coll_fk] = Collection.create({
                'foreign_id': coll_fk,
                'label': label,
                'managed': True
            })
            if coll_data.get('public'):
                # NOTE(review): the two flags are presumably read=True,
                # write=False -- confirm against Permission.grant_foreign.
                Permission.grant_foreign(self.collections[coll_fk],
                                         Role.SYSTEM_GUEST,
                                         True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        ingest_file(self.collections[coll_fk].id, meta,
                    item.data_path, move=False)
Esempio n. 25
0
    def ingest_attachment(self, part, meta):
        """Write an e-mail attachment to a temp file and ingest it.

        Parts without a body and the Outlook ``rtf-body.rtf`` representation
        are skipped. The child metadata is built, and the skip decision
        made, before the temp file is written, so skipped parts no longer
        leave an orphaned temp file behind.
        """
        _, ext = os.path.splitext(part.detected_file_name)
        if len(ext):
            ext = ext.strip().lower()
        body = part.body
        if body is None:
            return

        child = meta.clone()
        child.clear('title')
        child.clear('extension')
        child.clear('mime_type')
        child.clear('file_name')
        child.parent = meta.clone()
        child.file_name = unicode(part.detected_file_name)
        child.mime_type = unicode(part.detected_content_type)

        # Weird outlook RTF representations -- do we want them?
        if child.file_name == 'rtf-body.rtf':
            return

        # Only touch the disk once we know the part will be ingested.
        out_path = self.write_temp(body, ext)
        ingest_file(self.collection_id, child, out_path, move=True)
Esempio n. 26
0
def ingest_upload(collection_id):
    """Accept uploaded files for a collection and hand them to the ingestor.

    The collection must exist (404 otherwise) and the caller must have
    write access to it. The ``meta`` form field must contain a JSON
    object; it is used as the base metadata for every uploaded file.

    Raises:
        BadRequest: if ``meta`` is not valid JSON or is not a JSON object.

    Returns a JSON response listing the metadata of all accepted files.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)
    try:
        meta = json.loads(request.form.get("meta", "{}"))
    except Exception as ex:
        raise BadRequest(unicode(ex))
    # Valid JSON such as `[]`, `null` or `"x"` would otherwise fail further
    # down with an unhandled error instead of a client error.
    if not isinstance(meta, dict):
        raise BadRequest(u'The "meta" field must be a JSON object.')

    metas = []
    for storage in request.files.values():
        file_meta = meta.copy()
        file_meta["mime_type"] = storage.mimetype
        file_meta["file_name"] = storage.filename
        validate(file_meta, "metadata.json#")
        file_meta = Metadata.from_data(file_meta)
        file_meta.crawler_id = "user_upload:%s" % request.auth_role.id
        file_meta.crawler_run = make_textid()
        sec_fn = os.path.join(get_upload_folder(), secure_filename(storage.filename))
        storage.save(sec_fn)
        ingest_file(collection.id, file_meta, sec_fn, move=True)
        metas.append(file_meta)
    return jsonify({"status": "ok", "metadata": metas})
Esempio n. 27
0
 def emit_file(self, source, meta, file_path, move=False):
     """Commit the database session, then ingest ``file_path`` for ``source``.

     A clone of ``meta`` is passed on rather than ``meta`` itself.
     """
     db.session.commit()
     source_id = source.id
     meta_copy = meta.clone()
     ingest_file(source_id, meta_copy, file_path, move=move)
Esempio n. 28
0
 def emit_file(self, meta, file_path, move=False):
     """Ingest ``file_path`` into this crawler's collection.

     A clone of ``meta`` is passed on rather than ``meta`` itself.
     """
     collection_id = self.collection.id
     meta_copy = meta.clone()
     ingest_file(collection_id, meta_copy, file_path, move=move)
Esempio n. 29
0
 def emit_file(self, meta, file_path):
     """Ingest ``file_path`` with ``meta`` into the source ``self.source_id``."""
     target_source = self.source_id
     ingest_file(target_source, meta, file_path)
Esempio n. 30
0
 def emit_file(self, meta, file_path, move=False):
     """Ingest ``file_path`` into this crawler's source.

     A clone of ``meta`` is passed on rather than ``meta`` itself.
     """
     source_id = self.source.id
     meta_copy = meta.clone()
     ingest_file(source_id, meta_copy, file_path, move=move)
Esempio n. 31
0
 def emit_content(self, source, meta, content):
     """Write ``content`` to a temporary file and ingest that file.

     The database session is committed first. The temporary file is
     deleted when the ``with`` block exits.
     """
     db.session.commit()
     with NamedTemporaryFile() as fh:
         fh.write(content)
         # The file is handed over by name, so buffered data must reach
         # the disk first; otherwise it may be read empty or truncated.
         fh.flush()
         ingest_file(source.id, meta.clone(), fh.name)
Esempio n. 32
0
File: crawler.py Progetto: 01-/aleph
 def emit_file(self, source, meta, file_path, move=False):
     """Commit pending database work and ingest ``file_path`` for ``source``.

     The ingestor receives a clone of ``meta``, not ``meta`` itself.
     """
     db.session.commit()
     target_id = source.id
     cloned = meta.clone()
     ingest_file(target_id, cloned, file_path, move=move)
Esempio n. 33
0
File: crawler.py Progetto: 01-/aleph
 def emit_content(self, source, meta, content):
     """Write ``content`` to a temporary file and ingest that file.

     The database session is committed first. The temporary file is
     deleted when the ``with`` block exits.
     """
     db.session.commit()
     with NamedTemporaryFile() as fh:
         fh.write(content)
         # The file is handed over by name, so buffered data must reach
         # the disk first; otherwise it may be read empty or truncated.
         fh.flush()
         ingest_file(source.id, meta.clone(), fh.name)
Esempio n. 34
0
 def emit_file(self, meta, file_path):
     """Pass ``file_path`` and ``meta`` to the ingestor for ``self.source_id``."""
     source_id = self.source_id
     ingest_file(source_id, meta, file_path)
Esempio n. 35
0
 def emit_file(self, meta, file_path, move=False):
     """Ingest ``file_path`` into the collection attached to this object.

     The ingestor receives a clone of ``meta``, not ``meta`` itself.
     """
     target_id = self.collection.id
     cloned = meta.clone()
     ingest_file(target_id, cloned, file_path, move=move)