Ejemplo n.º 1
0
 def __init__(self, *args, **kwargs):
     """Declare the output schema and create the storage handle.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(DownloadPlenumMeetingProtocolsProcessor, self).__init__(
         *args, **kwargs)

     # One dict per output column, listed in output order.
     session_id_field = {
         "name": "kns_plenum_session_id",
         "type": "integer",
         "description": "primary key from kns_plenumsession table",
     }
     object_name_field = {
         "name": "protocol_object_name",
         "type": "string",
         "description": "storage object name containing the downloaded protocol",
     }
     extension_field = {
         "name": "protocol_extension",
         "type": "string",
         "description": "file extension of the downloaded protocol",
     }
     self._schema["fields"] = [
         session_id_field,
         object_name_field,
         extension_field,
     ]
     self._schema["primaryKey"] = ["kns_plenum_session_id"]

     # Starts empty; whatever fills it lives outside this method.
     self._all_object_names = []
     self.s3 = object_storage.get_s3()
     # Matches a dot followed by 3-8 characters anchored at the end of the
     # input; group 1 captures the characters after the dot.
     self.extension_regex = re.compile("[.](.{3,8})$")
 def _process(self, datapackage, resources):
     """Prepare schema, storage and existing-row state, then filter.

     Sets the three output columns, obtains the storage handle, builds the
     ``db.ExistingRows`` lookup for the "committee-meeting-speakers" table
     keyed on ``meeting_id`` and returns whatever
     ``self._process_filter(datapackage, resources)`` returns.
     """
     # (column name, column type) pairs in output order.
     column_specs = (
         ("committee_id", "integer"),
         ("meeting_id", "integer"),
         ("name", "string"),
     )
     self._schema["fields"] = [
         {"name": column_name, "type": column_type}
         for column_name, column_type in column_specs
     ]
     self.s3 = object_storage.get_s3()
     self.existing_rows = db.ExistingRows(
         "committee-meeting-speakers",
         primary_key="meeting_id",
     )
     return self._process_filter(datapackage, resources)
 def _process(self, datapackage, resources):
     """Prepare schema, storage and existing-row state, then filter.

     Sets the five output columns, obtains the storage handle, builds the
     ``db.ExistingRows`` lookup for the "committee-meeting-attendees" table
     keyed on ``meeting_id`` and returns whatever
     ``self._process_filter(datapackage, resources)`` returns.
     """
     # (column name, column type) pairs in output order.
     column_specs = (
         ("committee_id", "integer"),
         ("meeting_id", "integer"),
         ("name", "string"),
         ("role", "string"),
         ("additional_information", "string"),
     )
     self._schema["fields"] = [
         {"name": column_name, "type": column_type}
         for column_name, column_type in column_specs
     ]
     self.s3 = object_storage.get_s3()
     self.existing_rows = db.ExistingRows(
         "committee-meeting-attendees",
         primary_key="meeting_id",
     )
     return self._process_filter(datapackage, resources)
 def __init__(self, *args, **kwargs):
     """Declare the output schema and create the storage handle.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(DownloadCommitteeMeetingProtocolsProcessor, self).__init__(*args, **kwargs)

     # (name, type, description) triples in output order.
     column_specs = (
         ("kns_committee_id", "integer",
          "primary key from kns_committee table"),
         ("kns_session_id", "integer",
          "primary key from kns_committeesession table"),
         ("protocol_object_name", "string",
          "storage object name containing the downloaded protocol"),
         ("protocol_extension", "string",
          "file extension of the downloaded protocol"),
     )
     self._schema["fields"] = [
         {"name": column_name, "type": column_type, "description": column_doc}
         for column_name, column_type, column_doc in column_specs
     ]
     self._schema["primaryKey"] = ["kns_session_id"]

     # Starts empty; whatever fills it lives outside this method.
     self._all_object_names = []
     self.s3 = object_storage.get_s3()
 def __init__(self, *args, **kwargs):
     """Declare the single-column output schema and the storage handle.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(DownloadMembersProcessor, self).__init__(*args, **kwargs)

     person_id_field = {
         "name": "kns_person_id",
         "type": "integer",
         "description": "primary key from kns_person table",
     }
     self._schema["fields"] = [person_id_field]
     self._schema["primaryKey"] = ["kns_person_id"]

     # Starts empty; whatever fills it lives outside this method.
     self._all_object_names = []
     self.s3 = object_storage.get_s3()
 def __init__(self, *args, **kwargs):
     """Declare the output schema and create the storage handle.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(ParseCommitteeMeetingProtocolsProcessor, self).__init__(*args, **kwargs)

     # (name, type, description) triples in output order.
     column_specs = (
         ("kns_committee_id", "integer",
          "primary key from kns_committee table"),
         ("kns_session_id", "integer",
          "primary key from kns_committeesession table"),
         ("protocol_object_name", "string",
          "storage object name containing the downloaded protocol"),
         ("protocol_extension", "string",
          "file extension of the downloaded protocol"),
         ("text_object_name", "string",
          "storage object name containing the parsed protocol text"),
         ("parts_object_name", "string",
          "storage object name containing the parsed protocol csv"),
     )
     self._schema["fields"] = [
         {"name": column_name, "type": column_type, "description": column_doc}
         for column_name, column_type, column_doc in column_specs
     ]
     self._schema["primaryKey"] = ["kns_session_id"]
     self.s3 = object_storage.get_s3()
Ejemplo n.º 7
0
def get_pipeline_schema(pipeline_spec, pipeline_id):
    """Return the parsed table-schema JSON for ``pipeline_id``.

    The schema is looked up in object storage first; when the object is
    absent it is fetched over HTTPS instead.

    Args:
        pipeline_spec: used directly as the storage bucket name.
        pipeline_id: used to build the object name
            ``table-schemas/<pipeline_id>.json``.

    Returns:
        The decoded JSON document.

    Raises:
        requests.exceptions.RequestException: when the remote fallback
            fails, times out, or answers with an HTTP error status.
    """
    bucket = pipeline_spec
    # The former special case for 'committee_meeting_protocols_parsed'
    # produced exactly this same name, so a single expression covers it.
    object_name = "table-schemas/{}.json".format(pipeline_id)
    s3 = object_storage.get_s3()
    if object_storage.exists(s3, bucket, object_name):
        return json.loads(object_storage.read(s3, bucket, object_name))

    logging.warning("Missing local table schema, trying from remote")
    url = "https://minio.oknesset.org/{}/{}".format(bucket, object_name)
    # Without a timeout requests waits forever on an unresponsive server.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    return res.json()
Ejemplo n.º 8
0
def get_pipeline_schema(pipeline_spec, pipeline_id):
    """Return the parsed table-schema JSON for ``pipeline_id``.

    The schema is looked up in object storage first; when the object is
    absent it is fetched over HTTPS instead.

    Args:
        pipeline_spec: used directly as the storage bucket name.
        pipeline_id: used to build the object name
            ``table-schemas/<pipeline_id>.json``.

    Returns:
        The decoded JSON document.

    Raises:
        requests.exceptions.RequestException: when the remote fallback
            fails, times out, or answers with an HTTP error status.
    """
    bucket = pipeline_spec
    # The former special case for 'committee_meeting_protocols_parsed'
    # produced exactly this same name, so a single expression covers it.
    object_name = "table-schemas/{}.json".format(pipeline_id)
    s3 = object_storage.get_s3()
    if object_storage.exists(s3, bucket, object_name):
        return json.loads(object_storage.read(s3, bucket, object_name))

    logging.warning("Missing local table schema, trying from remote")
    url = "https://minio.oknesset.org/{}/{}".format(bucket, object_name)
    # Without a timeout requests waits forever on an unresponsive server.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    return res.json()
Ejemplo n.º 9
0
def filter_resources(datapackage, resources, parameters, stats):
    """Yield each filtered resource, then optionally publish the schemas.

    This is a generator: the schema files are written only after the caller
    has consumed every yielded resource.

    Args:
        datapackage: descriptor with a "name" and a "resources" list; each
            resource descriptor needs "name" and a "schema" holding
            "fields" and "primaryKey".
        resources: iterable of resource data, paired positionally with
            ``datapackage["resources"]``.
        parameters: reads "save-schema" (optional) and, when saving,
            "bucket" (required).
        stats: dict; an entry per resource name is reset to 0 here and the
            dict is handed on to ``filter_resource``.

    Yields:
        The result of ``filter_resource`` for each resource.
    """
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"], schema["fields"], schema["primaryKey"]))

        yield filter_resource(resource_descriptor, resource_data, stats)

    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if not save_schema:
        # A falsy "save-schema" disables publishing altogether.
        return

    # Previously the configured value was read and then ignored in favour of
    # DEFAULT_SAVE_SCHEMA. Honour a string value as the object-name template;
    # any other truthy value (e.g. True) keeps using the default template.
    # NOTE(review): a custom template should contain both {table_name} and
    # {ext}, otherwise the html and json objects share one name.
    template = save_schema if isinstance(save_schema, str) else DEFAULT_SAVE_SCHEMA
    save_schema_html = template.format(table_name=datapackage["name"], ext="html")
    save_schema_json = template.format(table_name=datapackage["name"], ext="json")

    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(tables="".join(tables))
    resources_json = json.dumps(datapackage["resources"], indent=2, ensure_ascii=False)

    s3 = object_storage.get_s3()
    object_storage.write(s3, parameters["bucket"], save_schema_html, html, public_bucket=True)
    object_storage.write(s3, parameters["bucket"], save_schema_json, resources_json, public_bucket=True)
Ejemplo n.º 10
0
 def __init__(self, *args, **kwargs):
     """Initialise the parent class, then set up per-instance state.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(Processor, self).__init__(
         *args,
         **kwargs
     )
     # No table object yet; it is assigned elsewhere (not visible here).
     self._db_table = None
     # Storage handle as returned by object_storage.get_s3().
     self.s3 = object_storage.get_s3()