Ejemplo n.º 1
0
 def _save_url(self,
               url,
               bucket,
               object_name,
               num_retries,
               seconds_between_retries,
               retry_num=1):
     """Fetch *url* and store the response body in object storage.

     Retries up to num_retries times on request failures, sleeping
     seconds_between_retries between attempts; re-raises the last
     exception once retries are exhausted. Returns True when the
     response is HTTP 200 and the content was written, False otherwise.
     """
     while True:
         try:
             # NOTE(review): helper name looks like a typo of
             # "_requests_get"; it is defined elsewhere in the class,
             # so the spelling is preserved here.
             response = self._reuqests_get(url)
         except RequestException as err:
             if retry_num >= num_retries:
                 raise
             logging.exception(err)
             logging.info(
                 "retry {} / {}, waiting {} seconds before retrying...".
                 format(retry_num, num_retries, seconds_between_retries))
             time.sleep(seconds_between_retries)
             retry_num += 1
             continue
         if response.status_code != 200:
             return False
         object_storage.write(self.s3, bucket, object_name, response.content)
         return True
Ejemplo n.º 2
0
 def _save_schema_html(self, save_schema):
     """Render the schema page name from the template and upload the
     HTML document to the public schemas bucket.

     save_schema is a template string with {table_name} and {ext}
     placeholders.
     """
     target_bucket = self._parameters["schemas-bucket"]
     target_object = save_schema.format(table_name=self._tablename,
                                        ext="html")
     html_document = self._get_schema_html()
     object_storage.write(self.s3, target_bucket, target_object,
                          html_document, public_bucket=True)
Ejemplo n.º 3
0
 def _save_schema_json(self, save_schema):
     """Serialize self._schema to pretty-printed JSON and upload it to
     the public schemas bucket.

     save_schema is a template string with {table_name} and {ext}
     placeholders.
     """
     object_storage.write(
         self.s3,
         self._parameters["schemas-bucket"],
         save_schema.format(table_name=self._tablename, ext="json"),
         json.dumps(self._schema, indent=2),
         public_bucket=True)
 def _parse_doc_protocol(self, committee_id, meeting_id, bucket, protocol_object_name, parts_object_name, text_object_name):
     """Download a .doc committee protocol, extract its text and parts,
     and upload both back to the bucket.

     Returns True on success; returns False (and logs the traceback)
     when the document cannot be parsed, so the pipeline can skip it.
     """
     logging.info("parsing doc protocol {} --> {}, {}".format(
         protocol_object_name, parts_object_name, text_object_name))
     # Parser failures we deliberately skip instead of crashing:
     # AntiwordException - https://github.com/hasadna/knesset-data-pipelines/issues/15
     # ParseError       - https://github.com/hasadna/knesset-data-pipelines/issues/32
     recoverable_errors = (AntiwordException,
                           subprocess.SubprocessError,
                           xml.etree.ElementTree.ParseError)
     with object_storage.temp_download(self.s3, bucket, protocol_object_name) as local_path:
         try:
             with CommitteeMeetingProtocol.get_from_filename(local_path) as protocol:
                 object_storage.write(self.s3, bucket, text_object_name,
                                      protocol.text, public_bucket=True)
                 self._parse_protocol_parts(bucket, parts_object_name, protocol)
         except recoverable_errors:
             logging.exception(
                 "committee {} meeting {}: failed to parse doc file, skipping".format(
                     committee_id, meeting_id))
             return False
     return True
 def _save_url(self, url, bucket, object_name, num_retries, seconds_between_retries, retry_num=1):
     """Fetch *url* and store the response body as a public object.

     Retries up to num_retries times on request failures, sleeping
     seconds_between_retries between attempts; re-raises the last
     exception once retries are exhausted. Returns True when the
     response is HTTP 200 and the content was written, False otherwise.
     """
     try:
         # NOTE(review): helper name looks like a typo of "_requests_get";
         # it is defined elsewhere in the class, so it is kept as-is.
         response = self._reuqests_get(url)
     except RequestException as err:
         # Out of retries: propagate the failure to the caller.
         if retry_num >= num_retries:
             raise
         logging.exception(err)
         logging.info("retry {} / {}, waiting {} seconds before retrying...".format(retry_num, num_retries, seconds_between_retries))
         time.sleep(seconds_between_retries)
         return self._save_url(url, bucket, object_name, num_retries, seconds_between_retries, retry_num + 1)
     if response.status_code != 200:
         return False
     object_storage.write(self.s3, bucket, object_name, response.content, public_bucket=True)
     return True
Ejemplo n.º 6
0
def filter_resources(datapackage, resources, parameters, stats):
    """Filter each resource in the datapackage, yielding the filtered
    data, then optionally save the combined schema as HTML and JSON.

    Parameters:
        datapackage: datapackage descriptor dict with a "resources" list.
        resources: iterable of resource data, parallel to the descriptors.
        parameters: processor parameters; reads "save-schema" (template
            with {table_name} and {ext} placeholders, falsy to disable)
            and "bucket" (target bucket for the schema objects).
        stats: dict updated with a zeroed counter per resource name
            (filter_resource is expected to increment it).

    Yields:
        The result of filter_resource for each resource.
    """
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"], schema["fields"], schema["primaryKey"]))

        yield filter_resource(resource_descriptor, resource_data, stats)

    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(tables="".join(tables))

    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if save_schema:
        # Bug fix: format the configured template, not DEFAULT_SAVE_SCHEMA —
        # previously a caller-supplied "save-schema" was truth-tested but
        # silently ignored when building the object names.
        save_schema_html = save_schema.format(table_name=datapackage["name"], ext="html")
        save_schema_json = save_schema.format(table_name=datapackage["name"], ext="json")

        s3 = object_storage.get_s3()
        object_storage.write(s3, parameters["bucket"], save_schema_html, html, public_bucket=True)
        object_storage.write(s3, parameters["bucket"], save_schema_json, json.dumps(datapackage["resources"], indent=2, ensure_ascii=False), public_bucket=True)