def to_dict(self):
    """Serialize this alert, including its date fields, to a plain dict."""
    out = self.to_dict_dates()
    out['id'] = stringify(self.id)
    out['query'] = self.query
    out['normalized'] = self.normalized
    out['role_id'] = stringify(self.role_id)
    out['notified_at'] = self.notified_at
    return out
def to_dict(self):
    """Serialize this permission, including its date fields, to a plain dict."""
    out = self.to_dict_dates()
    out['id'] = stringify(self.id)
    out['role_id'] = stringify(self.role_id)
    out['collection_id'] = stringify(self.collection_id)
    out['read'] = self.read
    out['write'] = self.write
    return out
def __init__(self, args, authz, limit=None):
    """Parse the shared query arguments: paging, free text, prefix and cache."""
    if not isinstance(args, MultiDict):
        args = OrderedMultiDict(args)
    self.args = args
    self.authz = authz
    self.offset = max(0, self.getint('offset', 0))
    if limit is None:
        # Clamp the requested page size into [0, MAX_PAGE].
        requested = max(0, self.getint('limit', 20))
        limit = min(MAX_PAGE, requested)
    self.limit = limit
    self.text = stringify(self.get('q'))
    self.prefix = stringify(self.get('prefix'))
    # Disable or enable query caching
    self.cache = self.getbool('cache', settings.CACHE)
def items(self):
    """Yield (key, value) pairs of the query args, skipping the paging offset."""
    for key, raw in self.args.items(multi=True):
        if key == 'offset':
            continue
        cleaned = stringify(raw, encoding='utf-8')
        if cleaned is not None:
            yield key, cleaned
def getlist(self, name, default=None):
    """Return all non-null string values submitted for the given arg name."""
    cleaned = []
    for raw in self.args.getlist(name):
        text = stringify(raw, encoding='utf-8')
        if text is not None:
            cleaned.append(text)
    if cleaned:
        return cleaned
    return default or []
def ingest_upload(collection_id):
    """Upload one or more files into a collection and queue them for ingest.

    Requires write access to the collection. When no file payload is sent,
    a folder document (identified by its foreign_id) is created instead.
    """
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        # NOTE(review): only the last file's path/hash survive this loop;
        # presumably a single file per request is expected — confirm.
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            # No file content: treat this document as a directory node.
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        # Always remove the temporary upload directory.
        shutil.rmtree(upload_dir)
    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def update(self, data):
    """Apply user-editable profile fields from the given data dict."""
    self.name = data.get('name', self.name)
    self.is_muted = data.get('is_muted', self.is_muted)
    new_password = data.get('password')
    if new_password:
        self.set_password(new_password)
    self.locale = stringify(data.get('locale', self.locale))
    self.updated_at = datetime.utcnow()
def get(stub, clazz, key):
    """Retrieve an object that has been loaded (or None)."""
    _instrument_stub(stub)
    cache_key = stringify(key)
    if cache_key is None:
        return None
    return stub._rx_cache.get((clazz, cache_key))
def getlist(self, name, default=None):
    """Return all truthy string values submitted for the given arg name."""
    found = []
    for raw in self.args.getlist(name):
        text = stringify(raw, encoding='utf-8')
        if text:
            found.append(text)
    return found or default or []
def ingest_upload(collection_id):
    """Upload files into a collection: archive each one, persist a document
    record, and hand the resulting entity proxy to the ingest pipeline."""
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        # NOTE(review): only the last uploaded file's hash is kept; a single
        # file per request seems to be assumed — confirm with callers.
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        # Commit before ingest so the worker can see the new row.
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        # Always remove the temporary upload directory.
        shutil.rmtree(upload_dir)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def ingest(self, file_path, entity):
    """Parse an Outlook .msg file into an Email entity: headers, body,
    participants and attachments."""
    entity.schema = model.get('Email')
    msg = Message(file_path)
    self.extract_olefileio_metadata(msg, entity)
    # This property information was sourced from
    # http://www.fileformat.info/format/outlookmsg/index.htm
    # on 2013-07-22.
    headers = msg.getField('007D')
    if headers is not None:
        try:
            msg_headers = Parser().parsestr(headers, headersonly=True)
            self.extract_msg_headers(entity, msg_headers)
        except Exception:
            # Best-effort: a broken header block should not abort ingest.
            log.exception("Cannot parse Outlook-stored headers")
    entity.add('bodyText', msg.getField('1000'))
    entity.add('messageId', msg.getField('1035'))
    entity.add('subject', msg.getField('0037'))
    entity.add('threadTopic', msg.getField('0070'))
    # sender name and email
    sender = self.get_identity(msg.getField('0C1A'), msg.getField('0C1F'))
    self.apply_identities(entity, sender, 'emitters', 'sender')
    # received by (note: deliberately reuses the `sender` variable)
    sender = self.get_identity(msg.getField('0040'), msg.getField('0076'))
    self.apply_identities(entity, sender, 'recipients')
    froms = self.get_identities(msg.getField('1046'))
    self.apply_identities(entity, froms, 'emitters', 'from')
    tos = self.get_identities(msg.getField('0E04'))
    self.apply_identities(entity, tos, 'recipients', 'to')
    ccs = self.get_identities(msg.getField('0E03'))
    self.apply_identities(entity, ccs, 'recipients', 'cc')
    bccs = self.get_identities(msg.getField('0E02'))
    self.apply_identities(entity, bccs, 'recipients', 'bcc')
    self.resolve_message_ids(entity)
    for attachment in msg.attachments:
        # Prefer the long filename; fall back to the 8.3 short name.
        name = stringify(attachment.longFilename)
        name = name or stringify(attachment.shortFilename)
        self.ingest_attachment(entity, name,
                               attachment.mimeType,
                               attachment.data)
def to_dict(self):
    """Serialize the collection to a plain dict for the API layer."""
    data = self.to_dict_dates()
    # Unknown category/frequency values fall back to the defaults.
    in_categories = self.category in self.CATEGORIES
    data['category'] = self.category if in_categories else self.DEFAULT_CATEGORY
    in_frequencies = self.frequency in self.FREQUENCIES
    data['frequency'] = self.frequency if in_frequencies else self.DEFAULT_FREQUENCY
    data['kind'] = 'casefile' if self.casefile else 'source'
    data.update({
        'id': stringify(self.id),
        # NOTE(review): collection_id deliberately mirrors id here — confirm.
        'collection_id': stringify(self.id),
        'foreign_id': self.foreign_id,
        'creator_id': stringify(self.creator_id),
        'team_id': self.team_id,
        'label': self.label,
        'summary': self.summary,
        'publisher': self.publisher,
        'publisher_url': self.publisher_url,
        'info_url': self.info_url,
        'data_url': self.data_url,
        'casefile': self.casefile,
        'secret': self.secret,
        'xref': self.xref,
        'restricted': self.restricted,
        'countries': registry.country.normalize_set(self.countries),
        'languages': registry.language.normalize_set(self.languages),
    })
    return data
def update(self, data): self.name = data.get("name", self.name) self.is_muted = data.get("is_muted", self.is_muted) self.is_tester = data.get("is_tester", self.is_tester) if data.get("password"): self.set_password(data.get("password")) self.locale = stringify(data.get("locale", self.locale)) self.touch()
def ingest(self, file_path, entity):
    """Parse an Outlook-for-Mac OPF XML export into an Email entity."""
    entity.schema = model.get('Email')
    try:
        doc = self.parse_xml_path(file_path)
    except TypeError as te:
        raise ProcessingException("Cannot parse OPF XML file.") from te
    if len(doc.findall('//email')) != 1:
        raise ProcessingException("More than one email in file.")
    email = doc.find('//email')
    props = email.getchildren()
    # Map tag name -> cleaned text for every non-empty child element.
    props = {c.tag: stringify(c.text) for c in props if c.text}
    entity.add('subject', props.pop('OPFMessageCopySubject', None))
    entity.add('threadTopic', props.pop('OPFMessageCopyThreadTopic', None))
    entity.add('summary', props.pop('OPFMessageCopyPreview', None))
    # message IDs are already parsed, no need to clean prior:
    entity.add('messageId', props.pop('OPFMessageCopyMessageID', None))
    entity.add('date', self.get_date(props, 'OPFMessageCopySentTime'))
    entity.add('modifiedAt', self.get_date(props, 'OPFMessageCopyModDate'))
    senders = self.get_contacts(email, 'OPFMessageCopySenderAddress')
    self.apply_identities(entity, senders, 'emitters', 'sender')
    froms = self.get_contacts(email, 'OPFMessageCopyFromAddresses')
    self.apply_identities(entity, froms, 'emitters', 'from')
    tos = self.get_contacts(email, 'OPFMessageCopyToAddresses')
    self.apply_identities(entity, tos, 'recipients', 'to')
    ccs = self.get_contacts(email, 'OPFMessageCopyCCAddresses')
    self.apply_identities(entity, ccs, 'recipients', 'cc')
    bccs = self.get_contacts(email, 'OPFMessageCopyBCCAddresses')
    self.apply_identities(entity, bccs, 'recipients', 'bcc')
    entity.add('bodyText', props.pop('OPFMessageCopyBody', None))
    html = props.pop('OPFMessageCopyHTMLBody', None)
    # '1E0' is the marker value the OPF format uses for "has HTML body".
    has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)
    if has_html and stringify(html):
        self.extract_html_content(entity, html, extract_metadata=False)
    self.resolve_message_ids(entity)
def _field_values(self, el, name):
    """Collect the sorted, cleaned text values of the named field under `el`."""
    xpath = './ns:field[@name="%s"]/ns:value/text()' % name
    found = []
    for raw in el.xpath(xpath, namespaces=self.NSMAP):
        text = stringify(raw)
        if text is not None:
            found.append(text)
    return sorted(found)
def match_regexp(self, value, q, strict=False):
    """if value matches a regexp q"""
    text = stringify(value)
    pattern = re.compile(q)
    if text is not None and pattern.match(text):
        return
    self.shout("%r not matching the regexp %r", strict, text, q)
def __init__(self, origin, query_uid=None, match_uid=None):
    """Set up a linkage context scoped to the given origin label."""
    self.origin = stringify(origin)
    if self.origin is None:
        raise ValueError("Invalid origin")
    # Use a per-origin logger so log lines are attributable.
    logger_name = '%s.%s' % (project.name, self.origin)
    self.log = logging.getLogger(logger_name)
    self.query_uid = query_uid
    self.match_uid = match_uid
def sanitize_text(text, encoding=DEFAULT_ENCODING):
    """Normalize a string, strip unsafe characters and guarantee the result
    round-trips through the given encoding."""
    cleaned = remove_unsafe_chars(stringify(text, encoding_default=encoding))
    if cleaned is None:
        return None
    # Replace characters the encoding cannot represent, then decode strictly
    # so the returned string is guaranteed encodable.
    return cleaned.encode(encoding, 'replace').decode(encoding, 'strict')
def key_bytes(key: Any) -> bytes:
    """Convert the given data to a value appropriate for hashing."""
    if isinstance(key, bytes):
        return key
    text = stringify(key)
    # Unconvertible values hash as the empty byte string.
    return b"" if text is None else text.encode("utf-8")
def _serialize(self, obj):
    """Attach self-links and a writeable flag to a raw alert record."""
    pk = obj.get('id')
    obj['id'] = str(pk)
    obj['links'] = {'self': url_for('alerts_api.view', alert_id=pk)}
    role_id = obj.pop('role_id', None)
    # Only the alert's owner may modify it.
    obj['writeable'] = role_id == stringify(request.authz.id)
    return obj
def file_name(self):
    """Extract the file name from the Content-Disposition header, if any."""
    disposition = self.headers.get('content-disposition')
    if disposition is None:
        return None
    _, options = cgi.parse_header(disposition)
    # URL-decode the filename; stringify maps an empty name to None.
    return stringify(unquote(options.get('filename') or ''))
def ingest(self, file_path, entity): entity.schema = model.get("Email") try: doc = self.parse_xml_path(file_path) except TypeError as te: raise ProcessingException("Cannot parse OPF XML file.") from te if len(doc.findall("//email")) != 1: raise ProcessingException("More than one email in file.") email = doc.find("//email") props = email.getchildren() props = {c.tag: stringify(c.text) for c in props if c.text} # from pprint import pformat # log.info(pformat(props)) entity.add("subject", props.pop("OPFMessageCopySubject", None)) entity.add("threadTopic", props.pop("OPFMessageCopyThreadTopic", None)) entity.add("summary", props.pop("OPFMessageCopyPreview", None)) # message IDs are already parsed, no need to clean prior: entity.add("messageId", props.pop("OPFMessageCopyMessageID", None)) entity.add("date", self.get_date(props, "OPFMessageCopySentTime")) entity.add("modifiedAt", self.get_date(props, "OPFMessageCopyModDate")) senders = self.get_contacts(email, "OPFMessageCopySenderAddress") self.apply_identities(entity, senders, "emitters", "sender") froms = self.get_contacts(email, "OPFMessageCopyFromAddresses") self.apply_identities(entity, froms, "emitters", "from") tos = self.get_contacts(email, "OPFMessageCopyToAddresses") self.apply_identities(entity, tos, "recipients", "to") ccs = self.get_contacts(email, "OPFMessageCopyCCAddresses") self.apply_identities(entity, ccs, "recipients", "cc") bccs = self.get_contacts(email, "OPFMessageCopyBCCAddresses") self.apply_identities(entity, bccs, "recipients", "bcc") entity.add("bodyText", props.pop("OPFMessageCopyBody", None)) html = props.pop("OPFMessageCopyHTMLBody", None) has_html = "1E0" == props.pop("OPFMessageGetHasHTML", None) if has_html and stringify(html): self.extract_html_content(entity, html, extract_metadata=False) self.resolve_message_ids(entity)
def update(self, data, authz):
    """Apply editable collection fields from `data`; admin-only fields are
    gated on `authz`. Flushes the session and grants the creator access."""
    self.label = data.get('label', self.label)
    self.summary = data.get('summary', self.summary)
    self.publisher = data.get('publisher', self.publisher)
    self.publisher_url = data.get('publisher_url', self.publisher_url)
    if self.publisher_url is not None:
        self.publisher_url = stringify(self.publisher_url)
    self.info_url = data.get('info_url', self.info_url)
    if self.info_url is not None:
        self.info_url = stringify(self.info_url)
    self.data_url = data.get('data_url', self.data_url)
    if self.data_url is not None:
        self.data_url = stringify(self.data_url)
    self.countries = ensure_list(data.get('countries', self.countries))
    self.countries = [registry.country.clean(val) for val in self.countries]  # noqa
    self.languages = ensure_list(data.get('languages', self.languages))
    self.languages = [registry.language.clean(val) for val in self.languages]  # noqa
    self.frequency = data.get('frequency', self.frequency)
    self.restricted = data.get('restricted', self.restricted)
    self.xref = data.get('xref', self.xref)
    # Some fields are editable only by admins in order to have
    # a strict separation between source evidence and case
    # material.
    if authz.is_admin:
        self.category = data.get('category', self.category)
        self.casefile = as_bool(data.get('casefile'),
                                default=self.casefile)
        creator = ensure_dict(data.get('creator'))
        creator_id = data.get('creator_id', creator.get('id'))
        creator = Role.by_id(creator_id)
        if creator is not None:
            self.creator = creator
    if self.casefile:
        # Casefiles always carry the fixed 'casefile' category.
        self.category = 'casefile'
    self.touch()
    db.session.flush()
    if self.creator is not None:
        # Make sure the creator retains read/write access.
        Permission.grant(self, self.creator, True, True)
def records(self):
    """Iterate through the table applying filters on-the-go."""
    for url in self.urls:
        for row in self.read_csv(url):
            record = {ref: stringify(row.get(ref)) for ref in self.query.refs}
            if self.check_filters(record):
                yield record
def queue(stub, clazz, key, schema=None):
    """Notify the resolver associated with `stub` that the given object
    needs to be retrieved. Multiple calls with the same object signature
    will be merged."""
    _instrument_stub(stub)
    cache_key = stringify(key)
    if cache_key is not None:
        stub._rx_queue.add((clazz, cache_key, schema))
def to_dict(self):
    """Serialize the export record, including date fields, to a plain dict."""
    out = self.to_dict_dates()
    out["id"] = stringify(self.id)
    out["label"] = self.label
    out["operation"] = self.operation
    out["creator_id"] = stringify(self.creator_id)
    out["collection_id"] = self.collection_id
    out["expires_at"] = self.expires_at
    out["deleted"] = self.deleted
    # Map the status constant to its human-readable label.
    out["status"] = Status.LABEL.get(self.status)
    out["content_hash"] = self.content_hash
    out["file_size"] = self.file_size
    out["file_name"] = self.file_name
    out["mime_type"] = self.mime_type
    out["meta"] = self.meta
    return out
def normalize_mime_type(mime_type):
    """Clean up the mime type a bit."""
    cleaned = stringify(mime_type)
    if cleaned is None:
        return None
    cleaned = cleaned.lower()
    # A generic octet-stream carries no information; treat it as unknown.
    return None if cleaned == 'application/octet-stream' else cleaned
def is_integer(self, value, strict=False):
    """if value is an integer"""
    # Numeric instances pass directly; everything else must stringify
    # into a purely numeric text.
    if value is not None and isinstance(value, numbers.Number):
        return
    text = stringify(value)
    if text is not None and text.isnumeric():
        return
    self.shout("value %r is not an integer", strict, text)
def validate(self, email, **kwargs):
    """Check to see if this is a valid email address."""
    text = stringify(email)
    if text is None:
        return None
    if not self.EMAIL_REGEX.match(text):
        return False
    # Delegate the domain part to the domain validator.
    _, domain = text.rsplit('@', 1)
    return self.domains.validate(domain, **kwargs)
def parse(context, data):
    """Group CSV rows by their 'Group ID' column and emit one message per
    group, with all cell values cleaned to strings ('' for nulls)."""
    groups = {}
    res = context.http.rehash(data)
    with open(res.file_path, 'r', encoding='iso-8859-1') as csvfile:
        # ignore first line
        next(csvfile)
        for row in csv.DictReader(csvfile):
            # Group IDs arrive as floats ("12.0"); normalize to int.
            group = int(float(row.pop('Group ID')))
            cleaned = {}
            for key, value in row.items():
                # Call stringify only once per cell (was called twice);
                # map unconvertible/null cells to the empty string.
                text = stringify(value)
                cleaned[key] = '' if text is None else text
            groups.setdefault(group, []).append(cleaned)
    for group, rows in groups.items():
        context.emit(data={'group': group, 'rows': rows})
def _generate_stream():
    """Yield each row of the enclosing `iterable` as one CSV-formatted line."""
    for row in iterable:
        cells = [stringify(cell) or '' for cell in row]
        sink = io.StringIO()
        csv.writer(sink, dialect='excel', delimiter=',').writerow(cells)
        yield sink.getvalue()
def store(self, key, value):
    """Upsert a JSON-serialized value under the given key with a timestamp."""
    cache_key = stringify(key)
    if cache_key is None:
        return
    record = {
        'key': cache_key,
        'value': json.dumps(value),
        'timestamp': datetime.utcnow(),
    }
    self.table.upsert(record, ['key'])
def clean(self, text, **kwargs):
    """Create a more clean, but still user-facing version of an
    instance of the type."""
    value = stringify(text)
    if value is None:
        return None
    try:
        # Canonicalize via the stdlib IPv4/IPv6 parser.
        return str(ip_address(value))
    except ValueError:
        # Not a parseable IP address.
        return None
def __init__(
    self,
    model: "Model",
    query: "QueryMapping",
    name: str,
    data: Dict[str, Any],
    key_prefix: Optional[str] = None,
) -> None:
    """Configure an entity mapping: key hashing seed, target schema and
    per-property mappings.

    Raises InvalidMapping when keys/id_column are missing or both given,
    or when the schema or a property name is invalid.
    """
    self.model = model
    self.name = name
    # Seed the key hash with an optional prefix and literal so identical
    # source values in different mappings yield distinct entity IDs.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get("key_literal")))
    self.keys = keys_values(data, "key", "keys")
    self.id_column = stringify(data.get("id_column"))
    if not len(self.keys) and self.id_column is None:
        raise InvalidMapping("No keys or ID: %r" % name)
    if len(self.keys) and self.id_column is not None:
        msg = "Please use only keys or id_column, not both: %r" % name
        raise InvalidMapping(msg)
    schema_name = stringify(data.get("schema"))
    if schema_name is None:
        raise InvalidMapping("No schema: %s" % name)
    schema = model.get(schema_name)
    if schema is None:
        raise InvalidMapping("Invalid schema: %s" % schema_name)
    self.schema = schema
    # All source columns this mapping reads: keys, id column and
    # whatever the property mappings reference.
    self.refs = set(self.keys)
    if self.id_column:
        self.refs.add(self.id_column)
    self.dependencies: Set[str] = set()
    self.properties: List[PropertyMapping] = []
    # NOTE(review): the loop variable shadows the `name` parameter below.
    for name, prop_mapping in data.get("properties", {}).items():
        prop = self.schema.get(name)
        if prop is None:
            raise InvalidMapping("Invalid property: %s" % name)
        mapping = PropertyMapping(query, prop_mapping, prop)
        self.properties.append(mapping)
        self.refs.update(mapping.refs)
        if mapping.entity:
            self.dependencies.add(mapping.entity)
def get_entity_id(obj: Any) -> Optional[str]:
    """Given an entity-ish object, try to get the ID."""
    if is_mapping(obj):
        value = obj.get("id")
    else:
        # Fall back to the object itself when it has no `id` attribute.
        value = getattr(obj, "id", obj)
    return stringify(value)
def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
    """Find the newest entity in a collection bearing the given foreign ID."""
    fid = stringify(foreign_id)
    if fid is None:
        return None
    q = cls.all(deleted=deleted)
    q = q.filter(Entity.collection_id == collection_id)
    # foreign_ids is an ARRAY column; containment needs an array literal.
    needle = func.cast([fid], ARRAY(db.Unicode()))
    q = q.filter(cls.foreign_ids.contains(needle))
    q = q.order_by(Entity.deleted_at.desc().nullsfirst())
    return q.first()
def to_dict(self):
    """Serialize the document via its followthemoney proxy representation."""
    data = self.to_proxy().to_full_dict()
    data.update(self.to_dict_dates())
    data['foreign_id'] = self.foreign_id
    data['collection_id'] = stringify(self.collection_id)
    data['bulk'] = False
    return data
def emit(self, text, type, key=None, weight=1): "Create a tag, this can be called multiple times with the same text." cleaner = self.CLEANERS[type] text = stringify(text) text = cleaner.clean(text, countries=self.document.countries) if text is None: return key = stringify(key) if key is None: key = slugify(text, sep='-') if key is None: return if (key, type) not in self.keyed: self.keyed[(key, type)] = dict(text=text, weight=weight) else: self.keyed[(key, type)]['weight'] += weight
def team_id(self):
    """Return the IDs of all non-system roles with read access to this
    collection, as strings."""
    role = aliased(Role)
    perm = aliased(Permission)
    query = db.session.query(role.id).filter(
        role.type != Role.SYSTEM,
        role.id == perm.role_id,
        perm.collection_id == self.id,
        perm.read == True,  # noqa
        role.deleted_at == None,  # noqa
        perm.deleted_at == None,  # noqa
    )
    return [stringify(rid) for (rid,) in query.all()]
def fetch():
    """Download the company-type CSV and regenerate the bundled types.yml.

    Fix: the original rebound `fh` (the open URL response) to the output
    file handle and never closed the response; use distinct names and
    close the response explicitly.
    """
    out_path = os.path.dirname(__file__)
    out_path = os.path.join(out_path, 'fingerprints', 'data', 'types.yml')
    types = {}
    res = urlopen(CSV_URL)
    try:
        for row in unicodecsv.DictReader(res):
            name = stringify(row.get('Name'))
            abbr = stringify(row.get('Abbreviation'))
            if name is None or abbr is None:
                continue
            # Surface conflicting abbreviations for manual review.
            if name in types and types[name] != abbr:
                print(name, types[name], abbr)
            types[name] = abbr
    finally:
        res.close()
    with open(out_path, 'w') as out_fh:
        yaml.safe_dump({'types': types}, out_fh, indent=2,
                       allow_unicode=True, canonical=False,
                       default_flow_style=False)
def to_dict(self):
    """Serialize the collection to a plain dict for the API layer."""
    data = self.to_dict_dates()
    # Unknown category values fall back to the default.
    in_categories = self.category in self.CATEGORIES
    data['category'] = self.category if in_categories else self.DEFAULT
    data['kind'] = 'casefile' if self.casefile else 'source'
    data.update({
        'id': stringify(self.id),
        # NOTE(review): collection_id deliberately mirrors id here — confirm.
        'collection_id': stringify(self.id),
        'foreign_id': self.foreign_id,
        'creator_id': stringify(self.creator_id),
        'team_id': self.team_id,
        'label': self.label,
        'summary': self.summary,
        'publisher': self.publisher,
        'publisher_url': self.publisher_url,
        'info_url': self.info_url,
        'data_url': self.data_url,
        'casefile': self.casefile,
        'secret': self.secret
    })
    return data
def update(self):
    """Apply the outcome of the result to the document."""
    doc = self.document
    if self.status == self.STATUS_SUCCESS:
        doc.status = Document.STATUS_SUCCESS
        doc.error_message = None
    else:
        doc.status = Document.STATUS_FAIL
        doc.error_message = stringify(self.error_message)
    # Pick the most specific schema whose flag is set; later entries in
    # SCHEMATA override earlier ones.
    schema = model['Document']
    for flag, name in self.SCHEMATA:
        if flag in self.flags:
            schema = model[name]
    doc.schema = schema.name
    doc.foreign_id = safe_string(self.id)
    doc.content_hash = self.checksum or doc.content_hash
    doc.pdf_version = self.pdf_checksum
    # For each metadata field, prefer the freshly-ingested value and
    # fall back to whatever was previously stored in doc.meta.
    doc.title = self.title or doc.meta.get('title')
    doc.file_name = self.file_name or doc.meta.get('file_name')
    doc.file_size = self.size or doc.meta.get('file_size')
    doc.summary = self.summary or doc.meta.get('summary')
    doc.author = self.author or doc.meta.get('author')
    doc.generator = self.generator or doc.meta.get('generator')
    doc.mime_type = self.mime_type or doc.meta.get('mime_type')
    doc.encoding = self.encoding or doc.meta.get('encoding')
    doc.date = self.date or doc.meta.get('date')
    doc.authored_at = self.created_at or doc.meta.get('authored_at')
    doc.modified_at = self.modified_at or doc.meta.get('modified_at')
    doc.published_at = self.published_at or doc.meta.get('published_at')
    doc.message_id = self.message_id or doc.meta.get('message_id')
    doc.in_reply_to = ensure_list(self.in_reply_to)
    doc.columns = list(self.columns.keys())
    doc.body_raw = self.body_html
    doc.body_text = self.body_text
    doc.headers = self.headers
    for kw in self.keywords:
        doc.add_keyword(safe_string(kw))
    for lang in self.languages:
        doc.add_language(safe_string(lang))
    db.session.flush()
    # Re-collect person/email tags produced by the ingestors.
    collector = DocumentTagCollector(doc, 'ingestors')
    for entity in self.entities:
        collector.emit(entity, DocumentTag.TYPE_PERSON)
    for email in self.emails:
        collector.emit(email, DocumentTag.TYPE_EMAIL)
    collector.save()
def to_dict(self):
    """Serialize the role, including its date fields, to a plain dict."""
    out = self.to_dict_dates()
    out['id'] = stringify(self.id)
    out['type'] = self.type
    out['name'] = self.name
    out['label'] = self.label
    out['email'] = self.email
    out['api_key'] = self.api_key
    out['is_admin'] = self.is_admin
    out['is_muted'] = self.is_muted
    out['has_password'] = self.has_password
    return out
def _load_metadata():
    """Unpack the common, pre-defined metadata for all the uploaded files."""
    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(str(ex))
    validate_data(meta, DocumentCreateSchema)
    foreign_id = stringify(meta.get('foreign_id'))
    if not request.files and foreign_id is None:
        # A directory upload carries no files and must be identifiable.
        response = jsonify({
            'status': 'error',
            'message': 'Directories need to have a foreign_id'
        }, status=400)
        raise BadRequest(response=response)
    return meta, foreign_id
def to_proxy(self):
    """Build a followthemoney entity proxy for this record: a page when it
    carries text, otherwise a table row."""
    if self.text is not None:
        page = model.make_entity(self.SCHEMA_PAGE)
        page.make_id('record', self.id)
        page.set('document', self.document_id)
        page.set('index', self.index)
        page.set('bodyText', stringify(self.text))
        return page
    row = model.make_entity(self.SCHEMA_ROW)
    row.make_id('record', self.id)
    row.set('table', self.document_id)
    row.set('index', self.index)
    if self.data is not None:
        # sort values by columns
        cells = [self.data.get(col)
                 for col in self.document.meta.get('columns')]
        row.set('cells', registry.json.pack(cells))
    return row
def create(cls, data, role_id):
    """Build a new alert for the given role from request data."""
    instance = cls()
    instance.role_id = role_id
    instance.query = stringify(data.get('query'))
    # Normalize/derive the remaining fields.
    instance.update()
    return instance