def update(self, data):
    """Merge new metadata into ``self.meta``, sanitizing all values.

    Country and language codes are normalised via the FtM registry;
    values the registry cannot normalise are dropped rather than stored
    as ``None``. Every other property is run through ``sanitize_text``
    (element-wise for lists), and properties that sanitize to ``None``
    are removed from the metadata entirely.
    """
    props = (
        "title", "summary", "author", "crawler", "source_url",
        "file_name", "mime_type", "headers", "date", "authored_at",
        "modified_at", "published_at", "retrieved_at", "languages",
        "countries", "keywords",
    )
    countries = ensure_list(data.get("countries", []))
    cleaned = [registry.country.clean(val) for val in countries]
    # Drop codes that failed to normalise instead of keeping None entries.
    data["countries"] = [val for val in cleaned if val is not None]
    languages = ensure_list(data.get("languages", []))
    cleaned = [registry.language.clean(val) for val in languages]
    data["languages"] = [val for val in cleaned if val is not None]
    for prop in props:
        text = data.get(prop, self.meta.get(prop))
        if isinstance(text, list):
            # Sanitize element-wise, discarding empty entries.
            values = [sanitize_text(txt) for txt in text]
            self.meta[prop] = [txt for txt in values if txt is not None]
        else:
            self.meta[prop] = sanitize_text(text)
        if self.meta.get(prop) is None:
            self.meta.pop(prop, None)
    # Tell SQLAlchemy the mutable JSON column was changed in place.
    flag_modified(self, "meta")
def __init__(self, args, authz, limit=None, max_limit=MAX_PAGE):
    """Parse paging and search arguments from a request's MultiDict."""
    if not isinstance(args, MultiDict):
        args = OrderedMultiDict(args)
    self.args = args
    self.authz = authz
    self.offset = max(0, self.getint("offset", 0))
    if limit is None:
        # Clamp the requested page size into [0, max_limit].
        requested = max(0, self.getint("limit", 20))
        limit = min(max_limit, requested)
    self.limit = limit
    self.text = sanitize_text(self.get("q"))
    self.prefix = sanitize_text(self.get("prefix"))
    # Query caching can be toggled per-request via the "cache" argument.
    self.cache = self.getbool("cache", settings.CACHE)
def __init__(self, model, data, key_prefix=None):
    """Build an entity from a raw dict, resolving schema and properties."""
    data = dict(data)
    raw_props = ensure_dict(data.pop('properties', {}))
    self.schema = model.get(data.pop('schema', None))
    if self.schema is None:
        raise InvalidData(gettext('No schema for entity.'))
    self.id = sanitize_text(data.pop('id', None))
    self.key_prefix = sanitize_text(key_prefix)
    # Whatever remains of the payload is carried along as context.
    self.context = data
    self._properties = {}
    self._size = 0
    if is_mapping(raw_props):
        for name, vals in raw_props.items():
            self.add(name, vals, cleaned=True, quiet=True)
def getlist(self, name, default=None):
    """Return all sanitized values for ``name``, or ``default`` if none."""
    cleaned = []
    for raw in self.args.getlist(name):
        text = sanitize_text(raw, encoding="utf-8")
        if text is not None:
            cleaned.append(text)
    if cleaned:
        return cleaned
    return default or []
def clean(self, text, format=None, **kwargs):
    """The classic: date parsing, every which way."""
    # Native date/datetime objects short-circuit the text handling.
    parsed = self._clean_datetime(text)
    if parsed is not None:
        return parsed
    text = sanitize_text(text)
    if text is None:
        return
    if format is None:
        return self._clean_text(text)
    # An explicit format string: parse strictly, then truncate the ISO
    # form to the precision the format actually encodes.
    try:
        value = datetime.strptime(text, format).date().isoformat()
        if self.MONTH_FORMATS.search(format) is None:
            return value[:4]
        if self.DAY_FORMATS.search(format) is None:
            return value[:7]
        return value
    except Exception:
        return None
def clean(self, text: Any, **kwargs) -> Optional[str]:
    """Normalise ``text`` into a value suitable for storage in an
    entity proxy, or ``None`` if it sanitizes away."""
    sanitized = sanitize_text(text)
    return None if sanitized is None else self.clean_text(sanitized, **kwargs)
def __init__(self, model, data, key_prefix=None, cleaned=True):
    """Instantiate a proxy from raw or pre-cleaned entity data."""
    data = dict(data)
    raw_props = data.pop("properties", {})
    if not cleaned:
        raw_props = ensure_dict(raw_props)
    self.schema = model.get(data.pop("schema", None))
    if self.schema is None:
        raise InvalidData(gettext("No schema for entity."))
    self.key_prefix = key_prefix
    self.id = data.pop("id", None)
    if not cleaned:
        self.id = sanitize_text(self.id)
    # Remaining payload fields travel along as opaque context.
    self.context = data
    self._properties = {}
    self._size = 0
    for name, vals in raw_props.items():
        if name not in self.schema.properties:
            continue
        if cleaned:
            # Trusted input: store directly, tracking total text size.
            unique = set(vals)
            self._properties[name] = unique
            self._size += sum([len(v) for v in unique])
        else:
            self.add(name, vals, cleaned=cleaned, quiet=True)
def items(self):
    """Yield sanitized (key, value) pairs, skipping paging parameters."""
    paging = ("offset", "limit", "next_limit")
    for key, raw in self.args.items(multi=True):
        if key in paging:
            continue
        text = sanitize_text(raw, encoding="utf-8")
        if text is not None:
            yield key, text
def items(self):
    """Yield sanitized (key, value) pairs, omitting the offset parameter."""
    for key, raw in self.args.items(multi=True):
        if key == 'offset':
            continue
        text = sanitize_text(raw, encoding='utf-8')
        if text is not None:
            yield key, text
def read_csv(cls, fh: Iterable[str]) -> Generator[Record, None, None]:
    """Stream sanitized records out of a CSV file handle."""
    for row in DictReader(fh, skipinitialspace=True):
        record: Record = {}
        for column, raw in row.items():
            text = sanitize_text(raw)
            if text is not None:
                record[column] = text
        yield record
def pick(self, values):
    """Pick a representative value: the lone entry, or the set-median."""
    sanitized = (sanitize_text(v) for v in ensure_list(values))
    candidates = [v for v in sanitized if v is not None]
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]
    return setmedian(candidates)
def records(self):
    """Iterate through the table applying filters on-the-go."""
    for url in self.urls:
        for row in self.read_csv_url(url):
            data = {ref: sanitize_text(row.get(ref)) for ref in self.query.refs}
            if self.check_filters(row):
                yield data
def ingest(self, file_path, entity):
    """Parse a vCard file, storing its text body and emitting card data."""
    entity.schema = model.get('PlainText')
    body = sanitize_text(self.read_file_decoded(entity, file_path))
    entity.set('bodyText', body)
    try:
        for card in vobject.readComponents(body):
            self.ingest_card(entity, card)
    except (ParseError, UnicodeDecodeError) as err:
        raise ProcessingException('Cannot parse vcard: %s' % err) from err
def update(self, data):
    """Merge sanitized metadata values from ``data`` into ``self.meta``.

    List-valued properties (e.g. ``headers``, ``languages``,
    ``countries``, ``keywords``) are sanitized element-wise; passing a
    whole list through ``sanitize_text`` would destroy it. Properties
    that sanitize to ``None`` are dropped from the metadata.
    """
    props = ('title', 'summary', 'author', 'crawler', 'source_url',
             'file_name', 'mime_type', 'headers', 'date', 'authored_at',
             'modified_at', 'published_at', 'retrieved_at', 'languages',
             'countries', 'keywords')
    for prop in props:
        value = data.get(prop, self.meta.get(prop))
        if isinstance(value, list):
            # Sanitize each element rather than the list object itself.
            self.meta[prop] = [sanitize_text(v) for v in value]
        else:
            self.meta[prop] = sanitize_text(value)
        if self.meta.get(prop) is None:
            self.meta.pop(prop, None)
    # Mark the mutable JSON column as dirty for SQLAlchemy.
    flag_modified(self, 'meta')
def validate(self, email, **kwargs):
    """Check to see if this is a valid email address."""
    # TODO: adopt email.utils.parseaddr
    cleaned = sanitize_text(email)
    if cleaned is None or not self.EMAIL_REGEX.match(cleaned):
        return False
    # Regex guarantees an '@', so this always isolates the domain part.
    domain = cleaned.rsplit('@', 1)[-1]
    return self.domains.validate(domain, **kwargs)
def validate(self, value: str) -> bool:
    """Check to see if this is a valid email address."""
    # TODO: adopt email.utils.parseaddr
    addr = sanitize_text(value)
    if addr is None:
        return False
    if not self.REGEX.match(addr):
        return False
    domain = addr.rsplit("@", 1)[-1]
    # Require a plausible domain: contains a dot, minimum length four.
    return "." in domain and len(domain) >= 4
def clean(
    self,
    raw: Any,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Return a storable value: strings are sanitized, everything else
    is packed."""
    if isinstance(raw, str):
        return sanitize_text(raw)
    return self.pack(raw)
def validate(self, obj, **kwargs):
    """Check if a thing is a valid domain name."""
    domain = sanitize_text(obj)
    if domain is None or len(domain) < 4:
        return False
    if '.' not in domain:
        return False
    # Reject strings that look like email addresses or carry a port/scheme.
    return '@' not in domain and ':' not in domain
def clean(
    self,
    raw: Any,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Normalise a raw value into storable form for an entity proxy,
    or ``None`` when it sanitizes away."""
    sanitized = sanitize_text(raw)
    if sanitized is None:
        return None
    return self.clean_text(sanitized, fuzzy=fuzzy, format=format, proxy=proxy)
def make_sheet(self, title: str, headers: List[str]) -> Worksheet:
    """Create a worksheet with a styled, frozen header row."""
    sheet = self.workbook.create_sheet(title=title)
    sheet.freeze_panes = "A2"
    sheet.sheet_properties.filterMode = True
    header_row = []
    for header in headers:
        cell = WriteOnlyCell(sheet, value=sanitize_text(header))
        cell.font = self.HEADER_FONT
        cell.fill = self.HEADER_FILL
        header_row.append(cell)
    sheet.append(header_row)
    return sheet
def update(self, data):
    """Merge sanitized metadata into ``self.meta``.

    Country and language codes are normalised through the FtM registry,
    discarding values that fail to normalise; all other properties are
    sanitized (element-wise for lists) and removed when empty.
    """
    props = ('title', 'summary', 'author', 'crawler', 'source_url',
             'file_name', 'mime_type', 'headers', 'date', 'authored_at',
             'modified_at', 'published_at', 'retrieved_at', 'languages',
             'countries', 'keywords')
    countries = ensure_list(data.get('countries', []))
    cleaned = [registry.country.clean(val) for val in countries]
    # Discard codes the registry could not normalise rather than storing None.
    data['countries'] = [val for val in cleaned if val is not None]
    languages = ensure_list(data.get('languages', []))
    cleaned = [registry.language.clean(val) for val in languages]
    data['languages'] = [val for val in cleaned if val is not None]
    for prop in props:
        value = data.get(prop, self.meta.get(prop))
        if isinstance(value, list):
            items = [sanitize_text(v) for v in value]
            self.meta[prop] = [v for v in items if v is not None]
        else:
            self.meta[prop] = sanitize_text(value)
        if self.meta.get(prop) is None:
            self.meta.pop(prop, None)
    # Flag the mutable JSON column so SQLAlchemy persists the change.
    flag_modified(self, 'meta')
def records(self):
    """Compose the actual query and return an iterator of ``Record``."""
    ref_names = [(ref, self.get_column(ref).name) for ref in self.query.refs]
    query = self.compose_query()
    log.info("Query: %s", query)
    result = self.engine.execute(query)
    while True:
        # Page through the result set to bound memory usage.
        page = result.fetchmany(size=DATA_PAGE)
        if not len(page):
            break
        for row in page:
            yield {ref: sanitize_text(row[name]) for ref, name in ref_names}
def __init__(
    self,
    model: "Model",
    data: Dict[str, Any],
    key_prefix: Optional[str] = None,
    cleaned: bool = True,
):
    data = dict(data or {})
    raw_props = data.pop("properties", {})
    if not cleaned:
        raw_props = ensure_dict(raw_props)
    #: The schema definition for this entity; it determines which
    #: properties may be set on it.
    schema = model.get(data.pop("schema", None))
    if schema is None:
        raise InvalidData(gettext("No schema for entity."))
    self.schema = schema
    #: Salt mixed into IDs generated via :meth:`~make_id` so that natural
    #: keys stay unique across datasets. Somewhat redundant since the
    #: introduction of :class:`~followthemoney.namespace.Namespace`.
    self.key_prefix = key_prefix
    #: Unique identifier (hashed natural key, UUID, or simple slug);
    #: may be signed by a :class:`~followthemoney.namespace.Namespace`.
    self.id = data.pop("id", None)
    if not cleaned:
        self.id = sanitize_text(self.id)
    #: Input fields other than ``id``, ``schema`` and ``properties`` are
    #: retained here and re-added upon serialization.
    self.context = data
    self._properties: Dict[str, Set[str]] = {}
    self._size = 0
    for name, vals in raw_props.items():
        if name not in self.schema.properties:
            continue
        if cleaned:
            # Pre-cleaned values are stored directly as a set.
            unique = set(vals)
            self._properties[name] = unique
            self._size += sum([len(v) for v in unique])
        else:
            self.add(name, vals, cleaned=cleaned, quiet=True)
def save(
    cls,
    collection,
    parent=None,
    foreign_id=None,
    content_hash=None,
    meta=None,
    role_id=None,
):
    """Find an existing document in ``collection`` by ``foreign_id`` or
    ``content_hash`` (optionally scoped to ``parent``), or create one.

    Raises ``ValueError`` if neither ``foreign_id`` nor ``content_hash``
    is supplied. The document is added to the session but not committed
    here; the instance is returned.
    """
    foreign_id = sanitize_text(foreign_id)
    q = cls.all()
    q = q.filter(Document.collection_id == collection.id)
    if parent is not None:
        q = q.filter(Document.parent_id == parent.id)
    # Prefer the foreign ID as the lookup criterion; fall back to the
    # content hash, otherwise there is no unique key to match on.
    if foreign_id is not None:
        q = q.filter(Document.foreign_id == foreign_id)
    elif content_hash is not None:
        q = q.filter(Document.content_hash == content_hash)
    else:
        raise ValueError("No unique criterion for document.")
    document = q.first()
    if document is None:
        document = cls()
        document.schema = cls.SCHEMA
        document.collection_id = collection.id
        document.role_id = role_id
    if parent is not None:
        document.parent_id = parent.id
    if foreign_id is not None:
        document.foreign_id = foreign_id
    document.content_hash = content_hash
    if content_hash is None:
        # No file content: treat the document as a folder.
        document.schema = cls.SCHEMA_FOLDER
    if meta is not None:
        document.update(meta)
    db.session.add(document)
    return document
def emit_row_dicts(self, table, rows, headers=None):
    """Write row dicts to a CSV work file and emit text fragments.

    Rows whose sanitized values are all empty are skipped, so they
    neither bloat the CSV nor produce empty searchable fragments
    (consistent with the sibling implementation of this method).
    """
    csv_path = self.make_work_file(table.id)
    row_count = 0
    with open(csv_path, 'w', encoding='utf-8') as fp:
        csv_writer = csv.writer(fp, dialect='unix')
        for row in rows:
            if headers is None:
                # Derive the column order from the first row seen.
                headers = list(row.keys())
            values = [sanitize_text(row.get(h)) for h in headers]
            # Skip rows that carry no text at all.
            length = sum((len(v) for v in values if v is not None))
            if length == 0:
                continue
            csv_writer.writerow(values)
            self.manager.emit_text_fragment(table, values, row_count)
            row_count += 1
            if row_count > 0 and row_count % 1000 == 0:
                log.info("Table emit [%s]: %s...", table, row_count)
    if row_count > 0:
        csv_hash = self.manager.store(csv_path, mime_type=CSV)
        table.set('csvHash', csv_hash)
        table.set('rowCount', row_count + 1)
        table.set('columns', registry.json.pack(headers))
def emit_row_dicts(self, table, rows, headers=None):
    """Stream row dicts into a CSV work file, emitting text fragments."""
    csv_path = self.make_work_file(table.id)
    emitted = 0
    with open(csv_path, "w", encoding=self.DEFAULT_ENCODING) as fh:
        writer = csv.writer(fh, dialect="unix")
        for row in rows:
            if headers is None:
                # Derive the column order from the first row seen.
                headers = list(row.keys())
            values = [sanitize_text(row.get(h)) for h in headers]
            # Rows with no textual content are not worth writing.
            if sum(len(v) for v in values if v is not None) == 0:
                continue
            writer.writerow(values)
            self.manager.emit_text_fragment(table, values, emitted)
            emitted += 1
            if emitted % 1000 == 0:
                log.info("Table emit [%s]: %s...", table, emitted)
    if emitted > 0:
        table.set("csvHash", self.manager.store(csv_path, mime_type=CSV))
        table.set("rowCount", emitted + 1)
        table.set("columns", registry.json.pack(headers))
def __init__(self, query, data, prop):
    """Configure a property mapping from its spec dict."""
    self.query = query
    data = deepcopy(data)
    self.data = data
    self.prop = prop
    self.name = prop.name
    self.type = prop.type
    self.refs = keys_values(data, "column", "columns")
    self.literals = keys_values(data, "literal", "literals")
    self.join = data.pop("join", None)
    self.split = data.pop("split", None)
    self.entity = data.pop("entity", None)
    self.required = data.pop("required", False)
    self.template = sanitize_text(data.pop("template", None))
    self.replacements = {}
    if self.template is not None:
        # Hack: derive column refs from the template's placeholders.
        for placeholder in self.FORMAT_PATTERN.findall(self.template):
            self.refs.append(placeholder)
            self.replacements["{{%s}}" % placeholder] = placeholder
def __init__(
    self, query: "QueryMapping", data: Dict[str, Any], prop: Property
) -> None:
    """Configure a typed property mapping from its spec dict."""
    self.query = query
    data = deepcopy(data)
    self.prop = prop
    self.refs = cast(List[str], keys_values(data, "column", "columns"))
    self.join = cast(Optional[str], data.pop("join", None))
    self.split = cast(Optional[str], data.pop("split", None))
    self.entity = stringify(data.pop("entity", None))
    self.format = stringify(data.pop("format", None))
    self.fuzzy = as_bool(data.pop("fuzzy", False))
    self.required = as_bool(data.pop("required", False))
    self.literals = cast(List[str], keys_values(data, "literal", "literals"))
    self.template = sanitize_text(data.pop("template", None))
    self.replacements: Dict[str, str] = {}
    if self.template is not None:
        # Hack: pull column refs out of the template's placeholders.
        for placeholder in self.FORMAT_PATTERN.findall(self.template):
            self.refs.append(placeholder)
            self.replacements["{{%s}}" % placeholder] = placeholder
def ingest_component(self, entity, idx, comp):
    """Translate a single iCalendar component into FtM event entities."""
    if comp.name == "VCALENDAR":
        entity.add("generator", comp.get("PRODID"))
    if comp.name != "VEVENT":
        return
    event = self.manager.make_entity("Event")
    self.manager.apply_context(event, entity)
    uid = sanitize_text(comp.get("UID"))
    if uid is None:
        # No stable UID: derive the ID from the parent and the index.
        event.make_id(entity.id, idx)
    else:
        event.make_id(uid)
    event.add("proof", entity)
    # Plain string fields map straight onto event properties.
    for prop, field in (
        ("name", "SUMMARY"),
        ("description", "DESCRIPTION"),
        ("location", "LOCATION"),
        ("sourceUrl", "URL"),
    ):
        event.add(prop, comp.get(field))
    # Date-valued fields go through iCal date conversion first.
    for prop, field in (
        ("startDate", "DTSTART"),
        ("endDate", "DTEND"),
        ("date", "CREATED"),
        ("modifiedAt", "LAST-MODIFIED"),
    ):
        event.add(prop, cal_date(comp.get(field)))
    event.add("organizer", self.address_entity(comp.get("ORGANIZER")))
    for attendee in ensure_list(comp.get("ATTENDEE")):
        event.add("involved", self.address_entity(attendee))
    self.manager.emit_entity(event, fragment=idx)
def validate(self, text, **kwargs):
    """Check whether ``text`` is a valid IBAN."""
    text = sanitize_text(text)
    if text is None:
        # sanitize_text yields None for empty/unusable input; passing
        # None to the validator could raise something other than
        # ValidationError (e.g. TypeError), escaping the handler below.
        return False
    try:
        return iban.validate(text)
    except ValidationError:
        return False