def ingest(self, meta, local_path):
    """Convert an Outlook PST file with readpst and ingest every output file.

    The converter binary is read from the ``READPST_BIN`` environment
    variable and defaults to ``readpst``. Its output goes to a temporary
    directory which is removed again when this method finishes, whether
    or not an error occurred.

    For each file found below the output directory a child of ``meta`` is
    created. The components of the file's directory, relative to the
    output directory, are added as keywords, and the child's foreign_id is
    ``meta.foreign_id`` joined with that relative directory and the file
    name.

    :param meta: metadata of the PST file; must offer ``make_child()`` and
        ``foreign_id``.
    :param local_path: path of the PST file on disk.
    """
    work_dir = make_tempdir()
    try:
        bin_path = os.environ.get('READPST_BIN', 'readpst')
        args = [
            bin_path, '-D', '-e', '-8', '-b',
            '-o', work_dir, local_path,
        ]
        log.debug('Converting Outlook PST file: %r', ' '.join(args))
        # Do not abort on failure: whatever the converter managed to write
        # is still ingested below, but the failure must not go unnoticed.
        exit_code = subprocess.call(args)
        if exit_code != 0:
            log.warning('readpst exited with status %r: %r',
                        exit_code, local_path)

        for (dirpath, dirnames, filenames) in os.walk(work_dir):
            reldir = os.path.relpath(string_value(dirpath),
                                     string_value(work_dir))
            for filename in filenames:
                filename = string_value(filename)
                child = meta.make_child()
                for kw in reldir.split(os.path.sep):
                    child.add_keyword(kw)
                fid = os.path.join(string_value(meta.foreign_id),
                                   string_value(reldir), filename)
                child.foreign_id = string_value(fid)
                file_path = os.path.join(string_value(dirpath), filename)
                ingest_file(self.collection_id, child, file_path, move=True)
    finally:
        remove_tempdir(work_dir)
def ingest_upload(collection_id):
    """Store uploaded files for a collection and hand each to ingest_file.

    The collection is resolved through ``obj_or_404`` and write access is
    required via ``request.authz`` before the request is logged.

    The ``meta`` form field (default ``'{}'``) is parsed as JSON and gets
    ``crawler_id`` and ``crawler_run`` added once for the whole request;
    any exception raised while doing so is re-raised as ``BadRequest``.

    For every uploaded file the shared metadata is copied, extended with
    the file's mime type and name, validated against ``metadata.json#`` and
    wrapped in ``Metadata``. The file is saved under ``upload_folder`` using
    its secured file name and then passed to ``ingest_file`` on the user
    queue.

    :param collection_id: identifier of the target collection.
    :returns: a JSON response with ``status`` and the list of metadata.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_write(collection.id))
    log_event(request)

    try:
        base_meta = json.loads(request.form.get('meta', '{}'))
        base_meta['crawler_id'] = 'user_upload:%s' % request.authz.role.id
        base_meta['crawler_run'] = make_textid()
    except Exception as ex:
        raise BadRequest(unicode(ex))

    results = []
    for upload in request.files.values():
        data = base_meta.copy()
        data.update({
            'mime_type': upload.mimetype,
            'file_name': upload.filename,
            'source_path': upload.filename,
        })
        validate(data, 'metadata.json#')
        upload_meta = Metadata.from_data(data)

        target = os.path.join(upload_folder, secure_filename(upload.filename))
        upload.save(target)
        ingest_file(collection_id, upload_meta, target, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        results.append(upload_meta)

    return jsonify({'status': 'ok', 'metadata': results})
def ingest_upload(collection_id):
    """Store uploaded files for a collection and hand each to ingest_file.

    The collection is resolved through ``obj_or_404`` and write access is
    required via ``authz`` before the request is logged. The ``meta`` form
    field (default ``'{}'``) is parsed as JSON; a parse failure is re-raised
    as ``BadRequest``.

    For every uploaded file the parsed metadata is copied, extended with
    the file's mime type and name, validated against ``metadata.json#`` and
    wrapped in ``Metadata``. Each file gets its own ``crawler_run`` value
    and a ``crawler_id`` derived from ``request.auth_role.id``. The file is
    saved in the upload folder under its secured name and then passed to
    ``ingest_file`` on the user queue.

    :param collection_id: identifier of the target collection.
    :returns: a JSON response with ``status`` and the list of metadata.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)

    try:
        base_meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(unicode(ex))

    results = []
    for upload in request.files.values():
        data = base_meta.copy()
        data['mime_type'] = upload.mimetype
        data['file_name'] = upload.filename
        validate(data, 'metadata.json#')

        upload_meta = Metadata.from_data(data)
        upload_meta.crawler_id = 'user_upload:%s' % request.auth_role.id
        upload_meta.crawler_run = make_textid()

        target = os.path.join(get_upload_folder(),
                              secure_filename(upload.filename))
        upload.save(target)
        ingest_file(collection.id, upload_meta, target, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        results.append(upload_meta)

    return jsonify({'status': 'ok', 'metadata': results})
def crawl_query(self, engine, collection, meta_base, name, query):
    """Run a configured SQL query, dump the result to CSV and ingest it.

    The metadata is built from a copy of ``meta_base`` updated with the
    query's optional ``meta`` mapping, marked as ``text/csv`` and given the
    foreign_id ``<collection.foreign_id>:<name>``.

    Rows are fetched in batches of 10000 and written to a temporary
    ``.csv`` file with a header row of column aliases. The file handle is
    closed before the file is passed to ``ingest_file``, and also when
    writing fails. The temporary file is unlinked afterwards if it still
    exists.

    :param engine: database engine used to execute the query.
    :param collection: collection providing ``id`` and ``foreign_id``.
    :param meta_base: base metadata mapping; it is copied, not modified.
    :param name: name of the query, used in the foreign_id.
    :param query: query specification handed to ``SQLQuery``; its optional
        ``meta`` entry is merged into the metadata.
    """
    meta_ = meta_base.copy()
    meta_.update(query.get('meta', {}))
    meta = self.make_meta(meta_)
    meta.extension = 'csv'
    meta.mime_type = 'text/csv'
    meta.data.update(meta_)
    meta.foreign_id = '%s:%s' % (collection.foreign_id, name)

    query = SQLQuery(engine, query)
    fd, file_path = mkstemp(suffix='.csv')
    try:
        # The context manager closes the handle on errors as well, so the
        # descriptor no longer leaks when the query or the writer fails.
        with os.fdopen(fd, 'w') as fh:
            headers = [query.alias(c) for c in query.columns]
            writer = unicodecsv.writer(fh, quoting=unicodecsv.QUOTE_ALL)
            writer.writerow(headers)
            log.info('Query: %s', query.query)
            rp = engine.execute(query.query)
            while True:
                rows = rp.fetchmany(10000)
                if not rows:
                    break
                for row in rows:
                    writer.writerow([row[h] for h in headers])
        ingest_file(collection.id, meta, file_path, move=True)
    finally:
        if os.path.isfile(file_path):
            os.unlink(file_path)
def emit_file(self, meta, file_path):
    """Ingest ``file_path`` as a child document derived from ``meta``.

    The child starts as a clone of ``meta`` with the file-specific fields
    cleared, and receives a second clone of ``meta`` as its parent.

    :param meta: metadata of the containing document.
    :param file_path: path of the file to ingest.
    """
    child = meta.clone()
    for field in ('extension', 'file_name', 'content_hash', 'mime_type'):
        child.clear(field)
    # NOTE: 'foreign_id' is left untouched; its clear() call was disabled.
    child.parent = meta.clone()
    ingest_file(self.collection_id, child, file_path)
def ingest_message(self, filepath, meta):
    """Ingest the message file at ``filepath`` as a child of ``meta``.

    The child is a clone of ``meta`` with title, extension, file name and
    mime type cleared. It receives a second clone of ``meta`` as its parent
    and ``filepath`` as its source path.

    :param filepath: path of the message file on disk.
    :param meta: metadata of the containing document.
    """
    child = meta.clone()
    for field in ('title', 'extension', 'file_name', 'mime_type'):
        child.clear(field)
    child.parent = meta.clone()
    child.source_path = filepath
    ingest_file(self.source_id, child, filepath, move=True)
def crawl_file(self, source, file_path, base_meta):
    """Ingest a single file from disk into ``source``.

    The normalised path is used as both foreign_id and source path, and
    its base name as the file name. Any exception is logged and swallowed,
    so a failing file does not interrupt the caller.

    :param source: source object; only its ``id`` is used.
    :param file_path: path of the file to ingest.
    :param base_meta: base metadata passed to ``self.make_meta``.
    """
    try:
        meta = self.make_meta(base_meta)
        path = string_value(file_path)
        meta.foreign_id = path
        meta.source_path = path
        meta.file_name = os.path.basename(path)
        ingest_file(source.id, meta, path, move=False)
    except Exception as ex:
        log.exception(ex)
def crawl_file(self, source_id, file_path, base_meta):
    """Ingest a single file from disk into the source ``source_id``.

    Paths that are not regular files are logged and skipped. Otherwise the
    normalised path is used as both foreign_id and source path, and its
    base name as the file name. Any exception is logged and swallowed, so
    a failing file does not interrupt the caller.

    :param source_id: identifier of the target source.
    :param file_path: path of the file to ingest.
    :param base_meta: base metadata passed to ``self.make_meta``.
    """
    try:
        is_regular_file = os.path.isfile(file_path)
        if not is_regular_file:
            log.info('Invalid file path: %r', file_path)
            return

        meta = self.make_meta(base_meta)
        path = string_value(file_path)
        meta.foreign_id = path
        meta.source_path = path
        meta.file_name = os.path.basename(path)
        ingest_file(source_id, meta, path, move=False)
    except Exception as ex:
        log.exception(ex)
def crawl_file(self, collection_id, file_path, base_meta):
    """Ingest a single file from disk into the collection ``collection_id``.

    Paths that are not regular files are logged and skipped. Otherwise the
    normalised path is used as both foreign_id and source path, and its
    base name as the file name. Any exception is logged and swallowed, so
    a failing file does not interrupt the caller.

    :param collection_id: identifier of the target collection.
    :param file_path: path of the file to ingest.
    :param base_meta: base metadata passed to ``self.make_meta``.
    """
    try:
        is_regular_file = os.path.isfile(file_path)
        if not is_regular_file:
            log.info('Invalid file path: %r', file_path)
            return

        meta = self.make_meta(base_meta)
        path = string_value(file_path)
        meta.foreign_id = path
        meta.source_path = path
        meta.file_name = os.path.basename(path)
        ingest_file(collection_id, meta, path, move=False)
    except Exception as ex:
        log.exception(ex)
def ingest_attachment(self, part, meta):
    """Write a message part to a temporary file and ingest it as a child.

    Parts without a body are logged and skipped. The child metadata takes
    its mime type and file name from the part's detected values; the
    child's ``extension`` is passed to ``self.write_temp`` along with the
    body. The temporary file is removed afterwards even if ingestion fails.

    :param part: message part with ``body``, ``detected_content_type`` and
        ``detected_file_name``.
    :param meta: metadata of the containing message.
    """
    if part.body is None:
        log.warning("Empty attachment [%r]: %s", meta, part)
        return

    attachment_meta = meta.make_child()
    attachment_meta.mime_type = six.text_type(part.detected_content_type)
    attachment_meta.file_name = string_value(part.detected_file_name)

    temp_path = self.write_temp(part.body, attachment_meta.extension)
    try:
        ingest_file(self.collection_id, attachment_meta, temp_path,
                    move=True)
    finally:
        remove_tempfile(temp_path)
def ingest_attachment(self, attachment, meta):
    """Write an attachment to a temporary file and ingest it as a child.

    Attachments without data are logged and skipped. The child metadata
    uses ``attachment.longFilename`` as its file name. The temporary file
    is removed in a ``finally`` clause, so it is cleaned up even when
    writing or ingesting fails. Any exception is logged and swallowed.

    :param attachment: object with ``data`` and ``longFilename``.
    :param meta: metadata of the containing message.
    """
    try:
        if attachment.data is None:
            log.warning("Attachment is empty [%r]: %s",
                        meta, attachment.longFilename)
            return
        out_path = make_tempfile()
        try:
            with open(out_path, "w") as fh:
                fh.write(attachment.data)
            child = meta.make_child()
            child.file_name = attachment.longFilename
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            # Previously skipped whenever an exception was raised above,
            # which left the temporary file behind.
            remove_tempfile(out_path)
    except Exception as ex:
        log.exception(ex)
def ingest_attachment(self, attachment, meta):
    """Write an attachment to a temporary file and ingest it as a child.

    Attachments without data are logged and skipped. The child metadata
    uses the normalised ``attachment.longFilename`` as its file name. The
    temporary file is removed in a ``finally`` clause, so it is cleaned up
    even when writing or ingesting fails. Any exception is logged and
    swallowed.

    :param attachment: object with ``data`` and ``longFilename``.
    :param meta: metadata of the containing message.
    """
    try:
        if attachment.data is None:
            log.warning("Attachment is empty [%r]: %s",
                        meta, attachment.longFilename)
            return
        out_path = make_tempfile()
        try:
            with open(out_path, 'w') as fh:
                fh.write(attachment.data)
            child = meta.make_child()
            child.file_name = string_value(attachment.longFilename)
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            # Previously skipped whenever an exception was raised above,
            # which left the temporary file behind.
            remove_tempfile(out_path)
    except Exception as ex:
        log.exception(ex)
def emit_member(self, meta, name, fh, temp_dir):
    """Copy one archive member into ``temp_dir`` and ingest it as a child.

    Only the base name of the normalised member name is used for the file
    on disk. The child is a clone of ``meta`` with title, extension, file
    name and mime type cleared; it receives a second clone of ``meta`` as
    parent, the base name as file name and ``name`` as source path. Any
    exception is logged and swallowed.

    :param meta: metadata of the containing archive.
    :param name: member name as stored in the archive.
    :param fh: readable file object with the member's content.
    :param temp_dir: directory in which the member is written.
    """
    try:
        base_name = os.path.basename(os.path.normpath(name))
        target = os.path.join(temp_dir, base_name)

        child = meta.clone()
        for field in ('title', 'extension', 'file_name', 'mime_type'):
            child.clear(field)
        child.parent = meta.clone()
        child.file_name = base_name
        child.source_path = name

        with open(target, 'wb') as out:
            shutil.copyfileobj(fh, out)
        ingest_file(self.source_id, child, target)
    except Exception as ex:
        log.exception(ex)
def emit_member(self, meta, name, fh, temp_dir):
    """Copy one archive member into ``temp_dir`` and ingest it as a child.

    Only the base name of the normalised member name is used for the file
    on disk. The child is a clone of ``meta`` with the file-specific fields
    cleared; it receives a second clone of ``meta`` as parent, the base
    name as file name, ``name`` as source path and the foreign_id
    ``<meta.foreign_id>:<name>``.

    :param meta: metadata of the containing archive.
    :param name: member name as stored in the archive.
    :param fh: readable file object with the member's content.
    :param temp_dir: directory in which the member is written.
    """
    base_name = os.path.basename(os.path.normpath(name))
    target = os.path.join(temp_dir, base_name)

    child = meta.clone()
    cleared = ('title', 'extension', 'file_name', 'content_hash',
               'mime_type', 'foreign_id')
    for field in cleared:
        child.clear(field)
    child.parent = meta.clone()
    child.file_name = base_name
    child.source_path = name
    child.foreign_id = '%s:%s' % (meta.foreign_id, name)

    with open(target, 'wb') as out:
        shutil.copyfileobj(fh, out)
    ingest_file(self.source_id, child, target, move=True)
def ingest_attachment(self, part, meta):
    """Write a message part to a temporary file and ingest it as a child.

    Parts without a body, and parts whose file name is ``rtf-body.rtf``,
    are skipped. The temporary file is only written once the part is known
    to be ingested, and it is removed afterwards even if ingestion fails.

    :param part: message part with ``body``, ``detected_file_name`` and
        ``detected_content_type``.
    :param meta: metadata of the containing message.
    """
    ext = os.path.splitext(part.detected_file_name)[1]
    if len(ext):
        ext = ext.strip().lower()

    body = part.body
    if body is None:
        return

    child = meta.make_child()
    child.file_name = part.detected_file_name
    child.mime_type = part.detected_content_type

    # Weird outlook RTF representations -- do we want them?
    # Checked before writing the body, so skipping leaves no file behind.
    if child.file_name == 'rtf-body.rtf':
        return

    out_path = self.write_temp(body, ext)
    try:
        ingest_file(self.collection_id, child, out_path, move=True)
    finally:
        remove_tempfile(out_path)
def emit_member(self, meta, name, fh, temp_dir):
    """Copy one archive member into ``temp_dir`` and ingest it as a child.

    Only the base name of the normalised member name is used for the file
    on disk. The child is a clone of ``meta`` with the file-specific fields
    cleared; it receives a second clone of ``meta`` as parent, the base
    name as file name, ``name`` as source path and the foreign_id
    ``<meta.foreign_id>:<name>``.

    :param meta: metadata of the containing archive.
    :param name: member name as stored in the archive.
    :param fh: readable file object with the member's content.
    :param temp_dir: directory in which the member is written.
    """
    base_name = os.path.basename(os.path.normpath(name))
    target = os.path.join(temp_dir, base_name)

    child = meta.clone()
    cleared = ('title', 'extension', 'file_name', 'content_hash',
               'mime_type', 'foreign_id')
    for field in cleared:
        child.clear(field)
    child.parent = meta.clone()
    child.file_name = base_name
    child.source_path = name
    child.foreign_id = '%s:%s' % (meta.foreign_id, name)

    with open(target, 'wb') as out:
        shutil.copyfileobj(fh, out)
    ingest_file(self.collection_id, child, target, move=True)
def ingest(self, meta, local_path):
    """Convert an Outlook PST file with readpst and ingest every output file.

    The converter binary is read from the ``READPST_BIN`` environment
    variable and defaults to ``readpst``. Its output goes to a temporary
    directory which is removed again when this method finishes, whether
    or not an error occurred.

    For each file found below the output directory a child of ``meta`` is
    created. The components of the file's directory, relative to the
    output directory, are added as keywords, and the child's foreign_id is
    ``meta.foreign_id`` joined with that relative directory and the file
    name. That child is what gets passed to ``ingest_file``.

    :param meta: metadata of the PST file; must offer ``make_child()`` and
        ``foreign_id``.
    :param local_path: path of the PST file on disk.
    """
    work_dir = make_tempdir()
    try:
        bin_path = os.environ.get('READPST_BIN', 'readpst')
        args = [bin_path, '-D', '-e', '-o', work_dir, local_path]
        log.debug('Converting Outlook PST file: %r', ' '.join(args))
        # Do not abort on failure: whatever the converter managed to write
        # is still ingested below, but the failure must not go unnoticed.
        exit_code = subprocess.call(args)
        if exit_code != 0:
            log.warning('readpst exited with status %r: %r',
                        exit_code, local_path)

        for (dirpath, dirnames, filenames) in os.walk(work_dir):
            reldir = os.path.relpath(dirpath, work_dir)
            for filename in filenames:
                child = meta.make_child()
                for kw in reldir.split(os.path.sep):
                    child.add_keyword(kw)
                child.foreign_id = os.path.join(meta.foreign_id, reldir,
                                                filename)
                # Pass the child, not the parent: the parent was passed
                # before, discarding the keywords and foreign_id set above.
                ingest_file(self.source_id, child,
                            os.path.join(dirpath, filename), move=True)
    finally:
        remove_tempdir(work_dir)
def emit_bundle(self, meta, directory, files):
    """Zip ``files`` from ``directory`` into a bundle and ingest it.

    The bundle is a child of ``meta`` with mime type ``self.BUNDLE_MIME``
    and the file name ``<meta.file_name>.<BUNDLE_EXTENSION>``. When ``meta``
    has a foreign_id, the bundle's source path is that foreign_id joined
    with the bundle extension. The archive is built uncompressed
    (``ZIP_STORED``) in a temporary directory, which is removed afterwards
    even if an error occurs.

    :param meta: metadata of the document the bundle belongs to.
    :param directory: directory containing the files to bundle.
    :param files: file names, relative to ``directory``, to add.
    """
    bundle = meta.make_child()
    if meta.foreign_id:
        bundle.source_path = os.path.join(meta.foreign_id,
                                          self.BUNDLE_EXTENSION)
    bundle.mime_type = self.BUNDLE_MIME
    bundle.file_name = '%s.%s' % (meta.file_name, self.BUNDLE_EXTENSION)
    log.info("Creating bundle: %r", bundle.file_name)

    scratch_dir = make_tempdir()
    try:
        archive_path = os.path.join(scratch_dir, bundle.file_name)
        with ZipFile(archive_path, 'w', ZIP_STORED) as archive:
            for member in files:
                # Stored under its relative name, read from `directory`.
                archive.write(os.path.join(directory, member), member)
        ingest_file(self.collection_id, bundle, archive_path, move=True)
    finally:
        remove_tempdir(scratch_dir)
def crawl_item(self, item):
    """Ingest one imported item into the collection named in its metadata.

    The collection's foreign_id is taken (and removed) from the item's
    ``source`` metadata. Collections are created on first use and cached in
    ``self.collections``; when the source data is marked ``public`` a
    permission is granted to ``Role.SYSTEM_GUEST``.

    :param item: object providing ``meta``, ``identifier`` and
        ``data_path``.
    :raises ValueError: if the item names no collection foreign_id.
    """
    coll_data = item.meta.get('source', {})
    # Default to None so a missing key reaches the explicit check below
    # instead of surfacing as a bare KeyError from pop().
    coll_fk = coll_data.pop('foreign_id', None)
    if coll_fk is None:
        raise ValueError("No foreign_id for collection given: %r" % item)

    if coll_fk not in self.collections:
        label = coll_data.get('label', coll_fk)
        self.collections[coll_fk] = Collection.create({
            'foreign_id': coll_fk,
            'label': label
        })
        if coll_data.get('public'):
            Permission.grant_foreign(self.collections[coll_fk],
                                     Role.SYSTEM_GUEST,
                                     True, False)
        db.session.commit()

    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.collections[coll_fk].id, meta,
                item.data_path, move=False)
def crawl_item(self, item, source):
    """Ingest one imported item into the source named in its metadata.

    The source's foreign_id is taken (and removed) from the item's
    ``source`` metadata, falling back to the ``source`` argument. Sources
    are created on first use and cached in ``self.sources``; when the
    source data is marked ``public`` a permission is granted to
    ``Role.SYSTEM_GUEST``.

    :param item: object providing ``meta``, ``identifier`` and
        ``data_path``.
    :param source: fallback foreign_id used when the item names none.
    :raises ValueError: if no foreign_id is available at all.
    """
    source_data = item.meta.get('source', {})
    source_fk = source_data.pop('foreign_id', source)
    if source_fk is None:
        raise ValueError("No foreign_id for source given: %r" % item)

    known = source_fk in self.sources
    if not known:
        spec = {
            'foreign_id': source_fk,
            'label': source_data.get('label', source_fk)
        }
        self.sources[source_fk] = Source.create(spec)
        if source_data.get('public'):
            Permission.grant_foreign(self.sources[source_fk],
                                     Role.SYSTEM_GUEST,
                                     True, False)
        db.session.commit()

    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.sources[source_fk].id, meta,
                item.data_path, move=False)
def ingest_attachment(self, part, meta):
    """Write a message part to a temporary file and ingest it as a child.

    Parts without a body, and parts whose file name is ``rtf-body.rtf``,
    are skipped. The child is a clone of ``meta`` with title, extension,
    mime type and file name cleared, a second clone of ``meta`` as parent,
    and the part's detected file name and content type. The temporary file
    is only written once the part is known to be ingested.

    :param part: message part with ``body``, ``detected_file_name`` and
        ``detected_content_type``.
    :param meta: metadata of the containing message.
    """
    ext = os.path.splitext(part.detected_file_name)[1]
    if len(ext):
        ext = ext.strip().lower()

    body = part.body
    if body is None:
        return

    child = meta.clone()
    child.clear('title')
    child.clear('extension')
    child.clear('mime_type')
    child.clear('file_name')
    child.parent = meta.clone()
    child.file_name = unicode(part.detected_file_name)
    child.mime_type = unicode(part.detected_content_type)

    # Weird outlook RTF representations -- do we want them?
    # Checked before writing the body, so skipping leaves no file behind.
    if child.file_name == 'rtf-body.rtf':
        return

    out_path = self.write_temp(body, ext)
    ingest_file(self.source_id, child, out_path, move=True)
def crawl_item(self, item):
    """Ingest one imported item into the collection named in its metadata.

    The collection's foreign_id is taken (and removed) from the item's
    ``source`` metadata. Collections are created on first use with
    ``managed`` set to True and cached in ``self.collections``; when the
    source data is marked ``public`` a permission is granted to
    ``Role.SYSTEM_GUEST``.

    :param item: object providing ``meta``, ``identifier`` and
        ``data_path``.
    :raises ValueError: if the item names no collection foreign_id.
    """
    coll_data = item.meta.get('source', {})
    # Default to None so a missing key reaches the explicit check below
    # instead of surfacing as a bare KeyError from pop().
    coll_fk = coll_data.pop('foreign_id', None)
    if coll_fk is None:
        raise ValueError("No foreign_id for collection given: %r" % item)

    if coll_fk not in self.collections:
        label = coll_data.get('label', coll_fk)
        self.collections[coll_fk] = Collection.create({
            'foreign_id': coll_fk,
            'label': label,
            'managed': True
        })
        if coll_data.get('public'):
            Permission.grant_foreign(self.collections[coll_fk],
                                     Role.SYSTEM_GUEST,
                                     True, False)
        db.session.commit()

    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.collections[coll_fk].id, meta,
                item.data_path, move=False)
def ingest_attachment(self, part, meta):
    """Write a message part to a temporary file and ingest it as a child.

    Parts without a body, and parts whose file name is ``rtf-body.rtf``,
    are skipped. The child is a clone of ``meta`` with title, extension,
    mime type and file name cleared, a second clone of ``meta`` as parent,
    and the part's detected file name and content type. The temporary file
    is only written once the part is known to be ingested.

    :param part: message part with ``body``, ``detected_file_name`` and
        ``detected_content_type``.
    :param meta: metadata of the containing message.
    """
    ext = os.path.splitext(part.detected_file_name)[1]
    if len(ext):
        ext = ext.strip().lower()

    body = part.body
    if body is None:
        return

    child = meta.clone()
    child.clear('title')
    child.clear('extension')
    child.clear('mime_type')
    child.clear('file_name')
    child.parent = meta.clone()
    child.file_name = unicode(part.detected_file_name)
    child.mime_type = unicode(part.detected_content_type)

    # Weird outlook RTF representations -- do we want them?
    # Checked before writing the body, so skipping leaves no file behind.
    if child.file_name == 'rtf-body.rtf':
        return

    out_path = self.write_temp(body, ext)
    ingest_file(self.collection_id, child, out_path, move=True)
def ingest_upload(collection_id):
    """Store uploaded files for a collection and hand each to ingest_file.

    The collection is resolved through ``obj_or_404`` and write access is
    required via ``authz`` before the request is logged. The ``meta`` form
    field (default ``"{}"``) is parsed as JSON; a parse failure is re-raised
    as ``BadRequest``.

    For every uploaded file the parsed metadata is copied, extended with
    the file's mime type and name, validated against ``metadata.json#`` and
    wrapped in ``Metadata``. Each file gets its own ``crawler_run`` value
    and a ``crawler_id`` derived from ``request.auth_role.id``. The file is
    saved in the upload folder under its secured name and then passed to
    ``ingest_file``.

    :param collection_id: identifier of the target collection.
    :returns: a JSON response with ``status`` and the list of metadata.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)

    try:
        base_meta = json.loads(request.form.get("meta", "{}"))
    except Exception as ex:
        raise BadRequest(unicode(ex))

    results = []
    for upload in request.files.values():
        data = base_meta.copy()
        data["mime_type"] = upload.mimetype
        data["file_name"] = upload.filename
        validate(data, "metadata.json#")

        upload_meta = Metadata.from_data(data)
        upload_meta.crawler_id = "user_upload:%s" % request.auth_role.id
        upload_meta.crawler_run = make_textid()

        target = os.path.join(get_upload_folder(),
                              secure_filename(upload.filename))
        upload.save(target)
        ingest_file(collection.id, upload_meta, target, move=True)
        results.append(upload_meta)

    return jsonify({"status": "ok", "metadata": results})
def emit_file(self, source, meta, file_path, move=False):
    """Commit the database session, then ingest ``file_path`` for ``source``.

    A clone of ``meta`` is handed to ``ingest_file``, not ``meta`` itself.

    :param source: source object; only its ``id`` is used.
    :param meta: metadata describing the file.
    :param file_path: path of the file to ingest.
    :param move: forwarded unchanged to ``ingest_file``.
    """
    db.session.commit()
    source_id = source.id
    meta_copy = meta.clone()
    ingest_file(source_id, meta_copy, file_path, move=move)
def emit_file(self, meta, file_path, move=False):
    """Ingest ``file_path`` into this crawler's collection.

    A clone of ``meta`` is handed to ``ingest_file``, not ``meta`` itself.

    :param meta: metadata describing the file.
    :param file_path: path of the file to ingest.
    :param move: forwarded unchanged to ``ingest_file``.
    """
    collection_id = self.collection.id
    meta_copy = meta.clone()
    ingest_file(collection_id, meta_copy, file_path, move=move)
def emit_file(self, meta, file_path):
    """Ingest ``file_path`` with ``meta`` into the source ``self.source_id``.

    :param meta: metadata describing the file; passed on as-is.
    :param file_path: path of the file to ingest.
    """
    target_source = self.source_id
    ingest_file(target_source, meta, file_path)
def emit_file(self, meta, file_path, move=False):
    """Ingest ``file_path`` into this crawler's source.

    A clone of ``meta`` is handed to ``ingest_file``, not ``meta`` itself.

    :param meta: metadata describing the file.
    :param file_path: path of the file to ingest.
    :param move: forwarded unchanged to ``ingest_file``.
    """
    source_id = self.source.id
    meta_copy = meta.clone()
    ingest_file(source_id, meta_copy, file_path, move=move)
def emit_content(self, source, meta, content):
    """Commit the database session, then ingest in-memory ``content``.

    The content is written to a named temporary file whose path is passed
    to ``ingest_file`` together with a clone of ``meta``. The file only
    exists inside the ``with`` block.

    :param source: source object; only its ``id`` is used.
    :param meta: metadata describing the content.
    :param content: data to write to the temporary file.
    """
    db.session.commit()
    with NamedTemporaryFile() as fh:
        fh.write(content)
        # The file is handed over by name, so buffered data must reach the
        # disk first; otherwise the reader may see an empty or cut-off file.
        fh.flush()
        ingest_file(source.id, meta.clone(), fh.name)