def __init__(self, *args, **kwargs):
    super(DownloadPlenumMeetingProtocolsProcessor, self).__init__(*args, **kwargs)
    self._schema["fields"] = [
        {"name": "kns_plenum_session_id", "type": "integer",
         "description": "primary key from kns_plenumsession table"},
        {"name": "protocol_object_name", "type": "string",
         "description": "storage object name containing the downloaded protocol"},
        {"name": "protocol_extension", "type": "string",
         "description": "file extension of the downloaded protocol"},
    ]
    self._schema["primaryKey"] = ["kns_plenum_session_id"]
    self._all_object_names = []
    self.s3 = object_storage.get_s3()
    # matches a 3-8 character file extension at the end of a name
    self.extension_regex = re.compile("[.](.{3,8})$")
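A minimal sketch of what the extension regex captures; the filenames are hypothetical, for illustration only:

import re

extension_regex = re.compile("[.](.{3,8})$")

# hypothetical protocol filenames
print(extension_regex.search("plenum_protocol_123.doc").group(1))  # doc
print(extension_regex.search("plenum_protocol_123.md"))            # None: extension shorter than 3 chars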
def _process(self, datapackage, resources): self._schema["fields"] = [{"name": "committee_id", "type": "integer"}, {"name": "meeting_id", "type": "integer"}, {"name": "name", "type": "string"} ] self.s3 = object_storage.get_s3() self.existing_rows = db.ExistingRows("committee-meeting-speakers", primary_key="meeting_id") return self._process_filter(datapackage, resources)
def _process(self, datapackage, resources): self._schema["fields"] = [{"name": "committee_id", "type": "integer"}, {"name": "meeting_id", "type": "integer"}, {"name": "name", "type": "string"}, {"name": "role", "type": "string"}, {"name": "additional_information", "type": "string"}, ] self.s3 = object_storage.get_s3() self.existing_rows = db.ExistingRows("committee-meeting-attendees", primary_key="meeting_id") return self._process_filter(datapackage, resources)
def __init__(self, *args, **kwargs):
    super(DownloadCommitteeMeetingProtocolsProcessor, self).__init__(*args, **kwargs)
    self._schema["fields"] = [
        {"name": "kns_committee_id", "type": "integer",
         "description": "primary key from kns_committee table"},
        {"name": "kns_session_id", "type": "integer",
         "description": "primary key from kns_committeesession table"},
        {"name": "protocol_object_name", "type": "string",
         "description": "storage object name containing the downloaded protocol"},
        {"name": "protocol_extension", "type": "string",
         "description": "file extension of the downloaded protocol"},
    ]
    self._schema["primaryKey"] = ["kns_session_id"]
    self._all_object_names = []
    self.s3 = object_storage.get_s3()
def __init__(self, *args, **kwargs):
    super(DownloadMembersProcessor, self).__init__(*args, **kwargs)
    self._schema["fields"] = [
        {"name": "kns_person_id", "type": "integer",
         "description": "primary key from kns_person table"},
    ]
    self._schema["primaryKey"] = ["kns_person_id"]
    self._all_object_names = []
    self.s3 = object_storage.get_s3()
def __init__(self, *args, **kwargs):
    super(ParseCommitteeMeetingProtocolsProcessor, self).__init__(*args, **kwargs)
    self._schema["fields"] = [
        {"name": "kns_committee_id", "type": "integer",
         "description": "primary key from kns_committee table"},
        {"name": "kns_session_id", "type": "integer",
         "description": "primary key from kns_committeesession table"},
        {"name": "protocol_object_name", "type": "string",
         "description": "storage object name containing the downloaded protocol"},
        {"name": "protocol_extension", "type": "string",
         "description": "file extension of the downloaded protocol"},
        {"name": "text_object_name", "type": "string",
         "description": "storage object name containing the parsed protocol text"},
        {"name": "parts_object_name", "type": "string",
         "description": "storage object name containing the parsed protocol csv"},
    ]
    self._schema["primaryKey"] = ["kns_session_id"]
    self.s3 = object_storage.get_s3()
def get_pipeline_schema(pipeline_spec, pipeline_id):
    bucket = pipeline_spec
    object_name = "table-schemas/{}.json".format(pipeline_id)
    s3 = object_storage.get_s3()
    if object_storage.exists(s3, bucket, object_name):
        return json.loads(object_storage.read(s3, bucket, object_name))
    else:
        logging.warning("Missing local table schema, trying from remote")
        url = "https://minio.oknesset.org/{}/{}".format(bucket, object_name)
        res = requests.get(url)
        res.raise_for_status()
        return res.json()
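A hedged usage sketch of get_pipeline_schema; the bucket (pipeline_spec) and pipeline id values are illustrative assumptions, and the returned schema is assumed to carry a "fields" list as in the processors above:

# hypothetical bucket and pipeline id, for illustration only
schema = get_pipeline_schema("committees", "kns_committee")
print([field["name"] for field in schema.get("fields", [])])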
def filter_resources(datapackage, resources, parameters, stats):
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"],
                                        schema["fields"], schema["primaryKey"]))
        yield filter_resource(resource_descriptor, resource_data, stats)
    # this is a generator, so the schema artifacts below are written only after
    # all resources have been fully iterated
    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(
        tables="".join(tables))
    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if save_schema:
        # save_schema is a filename template taking table_name and ext placeholders
        save_schema_html = save_schema.format(table_name=datapackage["name"], ext="html")
        save_schema_json = save_schema.format(table_name=datapackage["name"], ext="json")
        s3 = object_storage.get_s3()
        object_storage.write(s3, parameters["bucket"], save_schema_html, html,
                             public_bucket=True)
        object_storage.write(s3, parameters["bucket"], save_schema_json,
                             json.dumps(datapackage["resources"], indent=2, ensure_ascii=False),
                             public_bucket=True)
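The format() calls above imply the save-schema parameter is a filename template with table_name and ext placeholders. A sketch of that substitution, with an assumed default value; the real DEFAULT_SAVE_SCHEMA may differ:

# assumed template shape, for illustration only
DEFAULT_SAVE_SCHEMA = "table-schemas/{table_name}.{ext}"

print(DEFAULT_SAVE_SCHEMA.format(table_name="committees", ext="html"))  # table-schemas/committees.html
print(DEFAULT_SAVE_SCHEMA.format(table_name="committees", ext="json"))  # table-schemas/committees.json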
def __init__(self, *args, **kwargs):
    super(Processor, self).__init__(*args, **kwargs)
    self._db_table = None
    self.s3 = object_storage.get_s3()