Exemple #1
0
    def update(self, data):
        """Merge metadata from ``data`` into ``self.meta``.

        For each known metadata property the value in ``data`` takes
        precedence; otherwise the value already stored in ``self.meta`` is
        re-used. Every value (or every item of a list value) is passed
        through ``sanitize_text``; properties whose final value is ``None``
        are removed from ``self.meta``.

        ``countries`` and ``languages`` are always coerced to lists and
        cleaned through the type registry, so they are stored as empty lists
        when ``data`` does not supply them.

        The ``data`` mapping passed in by the caller is left unmodified.
        """
        props = (
            "title",
            "summary",
            "author",
            "crawler",
            "source_url",
            "file_name",
            "mime_type",
            "headers",
            "date",
            "authored_at",
            "modified_at",
            "published_at",
            "retrieved_at",
            "languages",
            "countries",
            "keywords",
        )
        # Work on a shallow copy: the normalised country/language lists are
        # written back into ``data`` below and must not leak to the caller.
        data = dict(data)
        countries = ensure_list(data.get("countries", []))
        data["countries"] = [registry.country.clean(val) for val in countries]
        languages = ensure_list(data.get("languages", []))
        data["languages"] = [registry.language.clean(val) for val in languages]

        for prop in props:
            value = data.get(prop, self.meta.get(prop))
            if isinstance(value, list):
                value = [sanitize_text(item) for item in value]
            else:
                value = sanitize_text(value)

            if value is None:
                # Do not keep empty properties around in the metadata.
                self.meta.pop(prop, None)
            else:
                self.meta[prop] = value

        # ``meta`` is changed in place, so flag it explicitly as modified.
        flag_modified(self, "meta")
Exemple #2
0
    def __init__(self, args, authz, limit=None, max_limit=MAX_PAGE):
        """Parse common query options from request arguments.

        ``args`` is wrapped in an ``OrderedMultiDict`` unless it already is a
        ``MultiDict``. An explicitly passed ``limit`` is used as given;
        otherwise the ``limit`` argument (default 20) is read and clamped to
        the range ``0..max_limit``. Negative offsets are treated as zero.
        """
        is_multi = isinstance(args, MultiDict)
        self.args = args if is_multi else OrderedMultiDict(args)
        self.authz = authz

        requested_offset = self.getint("offset", 0)
        self.offset = max(0, requested_offset)

        if limit is None:
            requested_limit = self.getint("limit", 20)
            limit = min(max_limit, max(0, requested_limit))
        self.limit = limit

        query_text = self.get("q")
        self.text = sanitize_text(query_text)
        prefix_text = self.get("prefix")
        self.prefix = sanitize_text(prefix_text)

        # Disable or enable query caching
        self.cache = self.getbool("cache", settings.CACHE)
Exemple #3
0
    def __init__(self, model, data, key_prefix=None):
        """Build the proxy from a dictionary representation of an entity.

        ``schema``, ``id`` and ``properties`` are taken out of a copy of
        ``data``; whatever remains is kept as ``self.context``. Raises
        ``InvalidData`` when ``model`` has no schema for the given name.
        """
        context = dict(data)
        raw_properties = ensure_dict(context.pop('properties', {}))
        schema_name = context.pop('schema', None)
        self.schema = model.get(schema_name)
        if self.schema is None:
            raise InvalidData(gettext('No schema for entity.'))
        self.id = sanitize_text(context.pop('id', None))
        self.key_prefix = sanitize_text(key_prefix)
        self.context = context
        self._properties = {}
        self._size = 0

        if not is_mapping(raw_properties):
            return
        for name, values in raw_properties.items():
            self.add(name, values, cleaned=True, quiet=True)
Exemple #4
0
 def getlist(self, name, default=None):
     """Return every sanitized value given for the argument ``name``.

     Values that sanitize to ``None`` are left out. If nothing remains,
     ``default`` is returned when it is truthy, otherwise an empty list.
     """
     sanitized = (
         sanitize_text(raw, encoding="utf-8")
         for raw in self.args.getlist(name)
     )
     found = [value for value in sanitized if value is not None]
     if found:
         return found
     return default or []
Exemple #5
0
    def clean(self, text, format=None, **kwargs):
        """Normalise a date-like value.

        The value is first offered to ``_clean_datetime``; a non-``None``
        result is returned directly. Otherwise the value is sanitized to
        text. With an explicit ``format`` the text is parsed via
        ``datetime.strptime`` and returned as an ISO date, cut down to
        ``YYYY`` when ``MONTH_FORMATS`` does not match ``format`` and to
        ``YYYY-MM`` when ``DAY_FORMATS`` does not match it; any parsing
        failure yields ``None``. Without ``format`` the text is handed to
        ``_clean_text``.
        """
        # handle date/datetime before converting to text.
        parsed = self._clean_datetime(text)
        if parsed is not None:
            return parsed

        text = sanitize_text(text)
        if text is None:
            return None

        if format is None:
            return self._clean_text(text)

        # parse with a specified format
        try:
            iso_date = datetime.strptime(text, format).date().isoformat()
            if self.MONTH_FORMATS.search(format) is None:
                return iso_date[:4]
            if self.DAY_FORMATS.search(format) is None:
                return iso_date[:7]
            return iso_date
        except Exception:
            return None
Exemple #6
0
 def clean(self, text: Any, **kwargs) -> Optional[str]:
     """Return a cleaned form of ``text`` for storage in an entity proxy.

     The value is sanitized first; ``None`` is returned when nothing is
     left, otherwise the result of ``clean_text`` on the sanitized text.
     """
     sanitized = sanitize_text(text)
     if sanitized is not None:
         return self.clean_text(sanitized, **kwargs)
     return None
Exemple #7
0
    def __init__(self, model, data, key_prefix=None, cleaned=True):
        """Build the proxy from a dictionary representation of an entity.

        With ``cleaned=True`` (the default) the ``id`` and property values
        are stored as given; otherwise the ``id`` is sanitized and each
        property goes through ``self.add``. Properties not defined on the
        schema are ignored. Raises ``InvalidData`` when ``model`` has no
        schema for the given name.
        """
        context = dict(data)
        properties = context.pop("properties", {})
        if not cleaned:
            properties = ensure_dict(properties)
        self.schema = model.get(context.pop("schema", None))
        if self.schema is None:
            raise InvalidData(gettext("No schema for entity."))
        self.key_prefix = key_prefix
        entity_id = context.pop("id", None)
        self.id = entity_id if cleaned else sanitize_text(entity_id)
        self.context = context
        self._properties = {}
        self._size = 0

        for name, raw in properties.items():
            if name not in self.schema.properties:
                continue
            if cleaned:
                # Values are stored as-is; track the combined length.
                unique = set(raw)
                self._properties[name] = unique
                self._size += sum(len(item) for item in unique)
            else:
                self.add(name, raw, cleaned=cleaned, quiet=True)
Exemple #8
0
 def items(self):
     """Yield sanitized ``(key, value)`` pairs from the arguments.

     The ``offset``, ``limit`` and ``next_limit`` keys are skipped, as are
     values that sanitize to ``None``.
     """
     skipped_keys = ("offset", "limit", "next_limit")
     for key, raw in self.args.items(multi=True):
         if key not in skipped_keys:
             cleaned = sanitize_text(raw, encoding="utf-8")
             if cleaned is not None:
                 yield key, cleaned
Exemple #9
0
 def items(self):
     """Yield sanitized ``(key, value)`` pairs from the arguments.

     The ``offset`` key is skipped, as are values that sanitize to ``None``.
     """
     for key, raw in self.args.items(multi=True):
         if key != 'offset':
             cleaned = sanitize_text(raw, encoding='utf-8')
             if cleaned is not None:
                 yield key, cleaned
Exemple #10
0
 def read_csv(cls, fh: Iterable[str]) -> Generator[Record, None, None]:
     """Yield one record per CSV row read from ``fh``.

     Rows are parsed with ``DictReader`` (``skipinitialspace=True``). Each
     cell is sanitized and cells that sanitize to ``None`` are omitted from
     the record.
     """
     for row in DictReader(fh, skipinitialspace=True):
         cells = ((ref, sanitize_text(raw)) for ref, raw in row.items())
         data: Record = {ref: val for ref, val in cells if val is not None}
         yield data
 def pick(self, values):
     """Pick a single representative out of ``values``.

     Values are sanitized and those that sanitize to ``None`` are dropped.
     Returns ``None`` when nothing is left, the only value when exactly one
     is left, and otherwise the result of ``setmedian`` over the rest.
     """
     candidates = []
     for raw in ensure_list(values):
         text = sanitize_text(raw)
         if text is not None:
             candidates.append(text)
     if not candidates:
         return None
     if len(candidates) == 1:
         return candidates[0]
     return setmedian(candidates)
Exemple #12
0
 def records(self):
     """Iterate through the table applying filters on-the-go.

     For every URL in ``self.urls`` the rows from ``read_csv_url`` are
     checked with ``check_filters``. Each accepted row is yielded as a dict
     mapping every ref in ``self.query.refs`` to the sanitized cell value
     (``None`` when the row has no value for that ref).
     """
     # Look the refs up once instead of once per row.
     refs = self.query.refs
     for url in self.urls:
         for row in self.read_csv_url(url):
             # Filter first so rejected rows are not sanitized for nothing.
             if not self.check_filters(row):
                 continue
             yield {ref: sanitize_text(row.get(ref)) for ref in refs}
Exemple #13
0
 def ingest(self, file_path, entity):
     """Ingest a vCard file into ``entity``.

     The entity is given the ``PlainText`` schema and the sanitized file
     contents as ``bodyText``; each component read by ``vobject`` is then
     passed to ``ingest_card``. Raises ``ProcessingException`` when parsing
     fails with ``ParseError`` or ``UnicodeDecodeError``.
     """
     entity.schema = model.get('PlainText')
     body = sanitize_text(self.read_file_decoded(entity, file_path))
     entity.set('bodyText', body)
     try:
         cards = vobject.readComponents(body)
         for card in cards:
             self.ingest_card(entity, card)
     except (ParseError, UnicodeDecodeError) as exc:
         raise ProcessingException('Cannot parse vcard: %s' % exc) from exc
Exemple #14
0
    def update(self, data):
        """Merge metadata from ``data`` into ``self.meta``.

        For each known property the value from ``data`` is used when
        present, otherwise the one already in ``self.meta``; the sanitized
        result is stored back under the same key.
        """
        props = (
            'title', 'summary', 'author', 'crawler', 'source_url',
            'file_name', 'mime_type', 'headers', 'date', 'authored_at',
            'modified_at', 'published_at', 'retrieved_at', 'languages',
            'countries', 'keywords',
        )
        for name in props:
            current = self.meta.get(name)
            self.meta[name] = sanitize_text(data.get(name, current))

        # ``meta`` is changed in place, so flag it explicitly as modified.
        flag_modified(self, 'meta')
Exemple #15
0
 def validate(self, email, **kwargs):
     """Check to see if this is a valid email address.

     The sanitized address must match ``EMAIL_REGEX``; the part after the
     last ``@`` is then checked by ``self.domains.validate``.
     """
     # TODO: adopt email.utils.parseaddr
     address = sanitize_text(email)
     if address is None or not self.EMAIL_REGEX.match(address):
         return False
     _local, domain = address.rsplit('@', 1)
     return self.domains.validate(domain, **kwargs)
Exemple #16
0
 def validate(self, value: str) -> bool:
     """Check to see if this is a valid email address.

     The sanitized value must match ``REGEX`` and the part after the last
     ``@`` must be at least four characters long and contain a dot.
     """
     # TODO: adopt email.utils.parseaddr
     email = sanitize_text(value)
     if email is None:
         return False
     if not self.REGEX.match(email):
         return False
     _local, domain = email.rsplit("@", 1)
     return len(domain) >= 4 and "." in domain
Exemple #17
0
 def clean(
     self,
     raw: Any,
     fuzzy: bool = False,
     format: Optional[str] = None,
     proxy: Optional["EntityProxy"] = None,
 ) -> Optional[str]:
     """Return a cleaned string form of ``raw``.

     Strings are sanitized; any other value is handed to ``self.pack``.
     The ``fuzzy``, ``format`` and ``proxy`` arguments are not used here.
     """
     if isinstance(raw, str):
         return sanitize_text(raw)
     return self.pack(raw)
Exemple #18
0
 def validate(self, obj, **kwargs):
     """Check if a thing is a valid domain name.

     The sanitized text must contain a dot, must not contain ``@`` or
     ``:``, and must be at least four characters long.
     """
     text = sanitize_text(obj)
     if text is None:
         return False
     has_dot = '.' in text
     has_forbidden = '@' in text or ':' in text
     return has_dot and not has_forbidden and len(text) >= 4
Exemple #19
0
 def clean(
     self,
     raw: Any,
     fuzzy: bool = False,
     format: Optional[str] = None,
     proxy: Optional["EntityProxy"] = None,
 ) -> Optional[str]:
     """Return a cleaned form of ``raw`` for storage in an entity proxy.

     The value is sanitized first; ``None`` is returned when nothing is
     left, otherwise the sanitized text is passed on to ``clean_text``
     together with ``fuzzy``, ``format`` and ``proxy``.
     """
     sanitized = sanitize_text(raw)
     if sanitized is not None:
         return self.clean_text(
             sanitized, fuzzy=fuzzy, format=format, proxy=proxy
         )
     return None
Exemple #20
0
 def make_sheet(self, title: str, headers: List[str]) -> Worksheet:
     """Create a worksheet called ``title`` with a styled header row.

     Panes are frozen at ``A2`` and the sheet's filter mode is switched on.
     Each header is sanitized and written as a cell using ``HEADER_FONT``
     and ``HEADER_FILL``.
     """
     sheet = self.workbook.create_sheet(title=title)
     sheet.freeze_panes = "A2"
     sheet.sheet_properties.filterMode = True

     def header_cell(label):
         # Build one styled header cell for the given label.
         cell = WriteOnlyCell(sheet, value=sanitize_text(label))
         cell.font = self.HEADER_FONT
         cell.fill = self.HEADER_FILL
         return cell

     sheet.append([header_cell(header) for header in headers])
     return sheet
Exemple #21
0
    def update(self, data):
        """Merge metadata from ``data`` into ``self.meta``.

        For each known metadata property the value in ``data`` takes
        precedence; otherwise the value already stored in ``self.meta`` is
        re-used. Every value (or every item of a list value) is passed
        through ``sanitize_text``; properties whose final value is ``None``
        are removed from ``self.meta``.

        ``countries`` and ``languages`` are always coerced to lists and
        cleaned through the type registry, so they are stored as empty lists
        when ``data`` does not supply them.

        The ``data`` mapping passed in by the caller is left unmodified.
        """
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        # Work on a shallow copy: the normalised country/language lists are
        # written back into ``data`` below and must not leak to the caller.
        data = dict(data)
        countries = ensure_list(data.get('countries', []))
        data['countries'] = [registry.country.clean(val) for val in countries]
        languages = ensure_list(data.get('languages', []))
        data['languages'] = [registry.language.clean(val) for val in languages]

        for prop in props:
            value = data.get(prop, self.meta.get(prop))
            if isinstance(value, list):
                value = [sanitize_text(item) for item in value]
            else:
                value = sanitize_text(value)

            if value is None:
                # Do not keep empty properties around in the metadata.
                self.meta.pop(prop, None)
            else:
                self.meta[prop] = value

        # ``meta`` is changed in place, so flag it explicitly as modified.
        flag_modified(self, 'meta')
Exemple #22
0
 def records(self):
     """Compose the actual query and return an iterator of ``Record``.

     The query is executed on ``self.engine`` and read in pages of
     ``DATA_PAGE`` rows. Each row is yielded as a dict mapping every query
     ref to the sanitized value of its column.
     """
     columns = [(ref, self.get_column(ref).name) for ref in self.query.refs]
     statement = self.compose_query()
     log.info("Query: %s", statement)
     result = self.engine.execute(statement)
     page = result.fetchmany(size=DATA_PAGE)
     while len(page):
         for row in page:
             yield {ref: sanitize_text(row[name]) for ref, name in columns}
         page = result.fetchmany(size=DATA_PAGE)
Exemple #23
0
    def __init__(
        self,
        model: "Model",
        data: Dict[str, Any],
        key_prefix: Optional[str] = None,
        cleaned: bool = True,
    ):
        """Build the proxy from a dictionary representation of an entity.

        With ``cleaned=True`` (the default) the ``id`` and property values
        are stored as given; otherwise the ``id`` is sanitized and each
        property goes through ``self.add``. Properties not defined on the
        schema are ignored. Raises ``InvalidData`` when ``model`` has no
        schema for the given name.
        """
        context = dict(data or {})
        properties = context.pop("properties", {})
        if not cleaned:
            properties = ensure_dict(properties)

        schema = model.get(context.pop("schema", None))
        if schema is None:
            raise InvalidData(gettext("No schema for entity."))
        #: The schema definition for this entity, which implies the properties
        #: that can be set on it.
        self.schema = schema

        #: When using :meth:`~make_id` to generate a natural key for this entity,
        #: the prefix will be added to the ID as a salt to make it easier to keep
        #: IDs unique across datasets. This is somewhat redundant following the
        #: introduction of :class:`~followthemoney.namespace.Namespace`.
        self.key_prefix = key_prefix

        entity_id = context.pop("id", None)
        #: A unique identifier for this entity, usually a hashed natural key,
        #: a UUID, or a very simple slug. Can be signed using a
        #: :class:`~followthemoney.namespace.Namespace`.
        self.id = entity_id if cleaned else sanitize_text(entity_id)

        #: If the input dictionary for the entity proxy contains fields other
        #: than ``id``, ``schema`` or ``properties``, they will be kept in here
        #: and re-added upon serialization.
        self.context = context
        self._properties: Dict[str, Set[str]] = {}
        self._size = 0

        for name, raw in properties.items():
            if name not in self.schema.properties:
                continue
            if cleaned:
                # Values are stored as-is; track the combined length.
                unique = set(raw)
                self._properties[name] = unique
                self._size += sum(len(item) for item in unique)
            else:
                self.add(name, raw, cleaned=cleaned, quiet=True)
Exemple #24
0
    def save(
        cls,
        collection,
        parent=None,
        foreign_id=None,
        content_hash=None,
        meta=None,
        role_id=None,
    ):
        """Find a document by various criteria, creating it if missing.

        The lookup is limited to ``collection`` (and ``parent`` when given)
        and matches on the sanitized ``foreign_id`` or, failing that, on
        ``content_hash``. Raises ``ValueError`` when neither is available.
        The found or newly created document is updated, added to the
        database session and returned.
        """
        foreign_id = sanitize_text(foreign_id)
        has_parent = parent is not None
        has_foreign_id = foreign_id is not None

        query = cls.all()
        query = query.filter(Document.collection_id == collection.id)
        if has_parent:
            query = query.filter(Document.parent_id == parent.id)
        if has_foreign_id:
            query = query.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            query = query.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")

        document = query.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection.id
            document.role_id = role_id

        if has_parent:
            document.parent_id = parent.id
        if has_foreign_id:
            document.foreign_id = foreign_id

        document.content_hash = content_hash
        # Documents without a content hash get the folder schema.
        if content_hash is None:
            document.schema = cls.SCHEMA_FOLDER

        if meta is not None:
            document.update(meta)

        db.session.add(document)
        return document
Exemple #25
0
 def emit_row_dicts(self, table, rows, headers=None):
     """Write dict rows to a work CSV file and emit them as text fragments.

     When ``headers`` is not given, the keys of the first row are used.
     Every cell is sanitized before it is written. If at least one row was
     written the CSV file is stored and its hash set as ``csvHash``;
     ``rowCount`` and ``columns`` are always set on ``table``.
     """
     csv_path = self.make_work_file(table.id)
     emitted = 0
     with open(csv_path, 'w', encoding='utf-8') as fp:
         writer = csv.writer(fp, dialect='unix')
         for emitted, row in enumerate(rows, start=1):
             if headers is None:
                 headers = list(row.keys())
             values = [sanitize_text(row.get(h)) for h in headers]
             writer.writerow(values)
             # Fragments are numbered from zero.
             self.manager.emit_text_fragment(table, values, emitted - 1)
             if emitted % 1000 == 0:
                 log.info("Table emit [%s]: %s...", table, emitted)
     if emitted > 0:
         csv_hash = self.manager.store(csv_path, mime_type=CSV)
         table.set('csvHash', csv_hash)
     table.set('rowCount', emitted + 1)
     table.set('columns', registry.json.pack(headers))
Exemple #26
0
 def emit_row_dicts(self, table, rows, headers=None):
     """Write dict rows to a work CSV file and emit them as text fragments.

     When ``headers`` is not given, the keys of the first row are used.
     Every cell is sanitized; rows whose cells are all ``None`` or empty
     are skipped. If at least one row was written the CSV file is stored
     and its hash set as ``csvHash``; ``rowCount`` and ``columns`` are
     always set on ``table``.
     """
     csv_path = self.make_work_file(table.id)
     written = 0
     with open(csv_path, "w", encoding=self.DEFAULT_ENCODING) as fp:
         writer = csv.writer(fp, dialect="unix")
         for row in rows:
             if headers is None:
                 headers = list(row.keys())
             values = [sanitize_text(row.get(h)) for h in headers]
             has_content = any(len(v) for v in values if v is not None)
             if not has_content:
                 continue
             writer.writerow(values)
             self.manager.emit_text_fragment(table, values, written)
             written += 1
             if written % 1000 == 0:
                 log.info("Table emit [%s]: %s...", table, written)
     if written > 0:
         csv_hash = self.manager.store(csv_path, mime_type=CSV)
         table.set("csvHash", csv_hash)
     table.set("rowCount", written + 1)
     table.set("columns", registry.json.pack(headers))
Exemple #27
0
    def __init__(self, query, data, prop):
        """Set up the mapping of one property from its configuration.

        ``data`` is deep-copied before any keys are taken out of it, and the
        copy is kept as ``self.data``. When a ``template`` is configured,
        every reference found in it by ``FORMAT_PATTERN`` is appended to
        ``self.refs`` and registered in ``self.replacements``.
        """
        self.query = query
        spec = deepcopy(data)
        self.data = spec
        self.prop = prop
        self.name = prop.name
        self.type = prop.type

        self.refs = keys_values(spec, "column", "columns")
        self.literals = keys_values(spec, "literal", "literals")
        self.join = spec.pop("join", None)
        self.split = spec.pop("split", None)
        self.entity = spec.pop("entity", None)
        self.required = spec.pop("required", False)

        template = sanitize_text(spec.pop("template", None))
        self.template = template
        self.replacements = {}
        if template is None:
            return
        # this is hacky, trying to generate refs from template
        for ref in self.FORMAT_PATTERN.findall(template):
            self.refs.append(ref)
            self.replacements["{{%s}}" % ref] = ref
    def __init__(self, query: "QueryMapping", data: Dict[str, Any],
                 prop: Property) -> None:
        """Set up the mapping of one property from its configuration.

        ``data`` is deep-copied before any keys are taken out of it. When a
        ``template`` is configured, every reference found in it by
        ``FORMAT_PATTERN`` is appended to ``self.refs`` and registered in
        ``self.replacements``.
        """
        self.query = query
        spec = deepcopy(data)
        self.prop = prop

        columns = keys_values(spec, "column", "columns")
        self.refs = cast(List[str], columns)
        self.join = cast(Optional[str], spec.pop("join", None))
        self.split = cast(Optional[str], spec.pop("split", None))
        self.entity = stringify(spec.pop("entity", None))
        self.format = stringify(spec.pop("format", None))
        self.fuzzy = as_bool(spec.pop("fuzzy", False))
        self.required = as_bool(spec.pop("required", False))
        literals = keys_values(spec, "literal", "literals")
        self.literals = cast(List[str], literals)

        template = sanitize_text(spec.pop("template", None))
        self.template = template
        self.replacements: Dict[str, str] = {}
        if template is None:
            return
        # this is hacky, trying to generate refs from template
        for ref in self.FORMAT_PATTERN.findall(template):
            self.refs.append(ref)
            self.replacements["{{%s}}" % ref] = ref
Exemple #29
0
 def ingest_component(self, entity, idx, comp):
     """Turn one calendar component into entity data.

     A ``VCALENDAR`` component contributes its ``PRODID`` as the
     ``generator`` of ``entity``. A ``VEVENT`` component becomes a new
     ``Event`` entity, identified by its sanitized ``UID`` or, without one,
     by the parent entity ID and ``idx``, and is emitted as fragment
     ``idx``. Other component types are ignored.
     """
     if comp.name == "VCALENDAR":
         entity.add("generator", comp.get("PRODID"))
     if comp.name != "VEVENT":
         return

     event = self.manager.make_entity("Event")
     self.manager.apply_context(event, entity)
     uid = sanitize_text(comp.get("UID"))
     if uid is None:
         event.make_id(entity.id, idx)
     else:
         event.make_id(uid)
     event.add("proof", entity)

     plain_fields = (
         ("name", "SUMMARY"),
         ("description", "DESCRIPTION"),
         ("location", "LOCATION"),
         ("sourceUrl", "URL"),
     )
     for prop, key in plain_fields:
         event.add(prop, comp.get(key))

     date_fields = (
         ("startDate", "DTSTART"),
         ("endDate", "DTEND"),
         ("date", "CREATED"),
         ("modifiedAt", "LAST-MODIFIED"),
     )
     for prop, key in date_fields:
         event.add(prop, cal_date(comp.get(key)))

     event.add("organizer", self.address_entity(comp.get("ORGANIZER")))
     for attendee in ensure_list(comp.get("ATTENDEE")):
         event.add("involved", self.address_entity(attendee))
     self.manager.emit_entity(event, fragment=idx)
Exemple #30
0
 def validate(self, text, **kwargs):
     """Check whether ``text`` is a valid IBAN.

     Returns ``False`` when the input sanitizes to ``None`` or when
     ``iban.validate`` raises ``ValidationError``; otherwise returns
     whatever ``iban.validate`` returns for the sanitized text.
     """
     text = sanitize_text(text)
     # Nothing left after sanitizing: reject explicitly instead of relying
     # on how the validator reacts to ``None``.
     if text is None:
         return False
     try:
         return iban.validate(text)
     except ValidationError:
         return False